clarifynd

This commit is contained in:
Dominique Eifländer 2024-04-16 12:02:22 +02:00
parent 9bd8419770
commit f9f8d9cf2f
4 changed files with 26 additions and 13 deletions

View File

@ -260,8 +260,10 @@ public class LayoutParsingPipeline {
case REDACT_MANAGER_OLD ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(layoutParsingType, stripper.getTextPositionSequences(), emptyTableCells, true, cleanRulings);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(layoutParsingType, stripper.getTextPositionSequences(), emptyTableCells, false, cleanRulings);
};
classificationPage.setCleanRulings(cleanRulings);

View File

@ -5,13 +5,12 @@ import static java.util.stream.Collectors.toSet;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Set;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
@ -19,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
@ -38,7 +38,11 @@ public class DocstrumBlockificationService {
static final float THRESHOLD = 1f;
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {
public ClassificationPage blockify(LayoutParsingType layoutParsingType,
List<TextPositionSequence> textPositions,
List<Cell> cells,
boolean xyOrder,
CleanRulings cleanRulings) {
// Underlines and strikethroughs also show up as rulings, but we don't want them to split blocks, so we use cells.
List<Ruling> usedHorizonalRulings = new ArrayList<>();
@ -52,11 +56,18 @@ public class DocstrumBlockificationService {
});
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
var pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder);
List<AbstractPageBlock> pageBlocks;
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
pageBlocks = toAbstractPageBlocks(zones, cleanRulings.getHorizontal(), cleanRulings.getVertical(), xyOrder);
} else {
pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder);
}
var classificationPage = new ClassificationPage(pageBlocks);
mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 0);
mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, layoutParsingType == LayoutParsingType.CLARIFYND ? 10 : 0);
return classificationPage;
}
@ -223,7 +234,7 @@ public class DocstrumBlockificationService {
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if(block == null){
if (block == null) {
continue;
}
if (block instanceof TablePageBlock) {
@ -234,7 +245,7 @@ public class DocstrumBlockificationService {
for (int i = 0; i < blocks.size(); i++) {
if(blocks.get(i) == null){
if (blocks.get(i) == null) {
continue;
}
if (blocks.get(i) == current) {
@ -259,8 +270,8 @@ public class DocstrumBlockificationService {
}
}
var blocksIterator = blocks.iterator();
while(blocksIterator.hasNext()){
if(blocksIterator.next() == null){
while (blocksIterator.hasNext()) {
if (blocksIterator.next() == null) {
blocksIterator.remove();
}
}

View File

@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/new/ScrambledTextAfterSorting.pdf";
String fileName = "files/WEF Global Risks Report 2017 - Part 1 (2).pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();
@ -35,7 +35,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
long start = System.currentTimeMillis();
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
Document document = buildGraph(fileName, LayoutParsingType.CLARIFYND);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
}