Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f9f8d9cf2f |
@ -260,8 +260,10 @@ public class LayoutParsingPipeline {
|
|||||||
case REDACT_MANAGER_OLD ->
|
case REDACT_MANAGER_OLD ->
|
||||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
|
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
|
||||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
|
docstrumBlockificationService.blockify(layoutParsingType, stripper.getTextPositionSequences(), emptyTableCells, true, cleanRulings);
|
||||||
|
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||||
|
docstrumBlockificationService.blockify(layoutParsingType, stripper.getTextPositionSequences(), emptyTableCells, false, cleanRulings);
|
||||||
};
|
};
|
||||||
|
|
||||||
classificationPage.setCleanRulings(cleanRulings);
|
classificationPage.setCleanRulings(cleanRulings);
|
||||||
|
|||||||
@ -5,13 +5,12 @@ import static java.util.stream.Collectors.toSet;
|
|||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.ListIterator;
|
import java.util.ListIterator;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||||
@ -19,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||||
@ -38,7 +38,11 @@ public class DocstrumBlockificationService {
|
|||||||
static final float THRESHOLD = 1f;
|
static final float THRESHOLD = 1f;
|
||||||
|
|
||||||
|
|
||||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {
|
public ClassificationPage blockify(LayoutParsingType layoutParsingType,
|
||||||
|
List<TextPositionSequence> textPositions,
|
||||||
|
List<Cell> cells,
|
||||||
|
boolean xyOrder,
|
||||||
|
CleanRulings cleanRulings) {
|
||||||
|
|
||||||
// Underlined or strikethrough text also appears in the rulings, but we don't want to split blocks on those, so we use cells.
|
// Underlined or strikethrough text also appears in the rulings, but we don't want to split blocks on those, so we use cells.
|
||||||
List<Ruling> usedHorizonalRulings = new ArrayList<>();
|
List<Ruling> usedHorizonalRulings = new ArrayList<>();
|
||||||
@ -52,11 +56,18 @@ public class DocstrumBlockificationService {
|
|||||||
});
|
});
|
||||||
|
|
||||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
|
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
|
||||||
var pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder);
|
|
||||||
|
List<AbstractPageBlock> pageBlocks;
|
||||||
|
|
||||||
|
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
||||||
|
pageBlocks = toAbstractPageBlocks(zones, cleanRulings.getHorizontal(), cleanRulings.getVertical(), xyOrder);
|
||||||
|
} else {
|
||||||
|
pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder);
|
||||||
|
}
|
||||||
|
|
||||||
var classificationPage = new ClassificationPage(pageBlocks);
|
var classificationPage = new ClassificationPage(pageBlocks);
|
||||||
|
|
||||||
mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 0);
|
mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, layoutParsingType == LayoutParsingType.CLARIFYND ? 10 : 0);
|
||||||
|
|
||||||
return classificationPage;
|
return classificationPage;
|
||||||
}
|
}
|
||||||
@ -223,7 +234,7 @@ public class DocstrumBlockificationService {
|
|||||||
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
||||||
while (itty.hasNext()) {
|
while (itty.hasNext()) {
|
||||||
AbstractPageBlock block = itty.next();
|
AbstractPageBlock block = itty.next();
|
||||||
if(block == null){
|
if (block == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (block instanceof TablePageBlock) {
|
if (block instanceof TablePageBlock) {
|
||||||
@ -234,7 +245,7 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
for (int i = 0; i < blocks.size(); i++) {
|
for (int i = 0; i < blocks.size(); i++) {
|
||||||
|
|
||||||
if(blocks.get(i) == null){
|
if (blocks.get(i) == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (blocks.get(i) == current) {
|
if (blocks.get(i) == current) {
|
||||||
@ -259,8 +270,8 @@ public class DocstrumBlockificationService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
var blocksIterator = blocks.iterator();
|
var blocksIterator = blocks.iterator();
|
||||||
while(blocksIterator.hasNext()){
|
while (blocksIterator.hasNext()) {
|
||||||
if(blocksIterator.next() == null){
|
if (blocksIterator.next() == null) {
|
||||||
blocksIterator.remove();
|
blocksIterator.remove();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
|
|
||||||
String fileName = "files/new/ScrambledTextAfterSorting.pdf";
|
String fileName = "files/WEF Global Risks Report 2017 - Part 1 (2).pdf";
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
|
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
@ -35,7 +35,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
|
Document document = buildGraph(fileName, LayoutParsingType.CLARIFYND);
|
||||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||||
}
|
}
|
||||||
|
|||||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user