clarifynd
This commit is contained in:
parent
9bd8419770
commit
f9f8d9cf2f
@ -260,8 +260,10 @@ public class LayoutParsingPipeline {
|
||||
case REDACT_MANAGER_OLD ->
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
|
||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
|
||||
docstrumBlockificationService.blockify(layoutParsingType, stripper.getTextPositionSequences(), emptyTableCells, true, cleanRulings);
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||
docstrumBlockificationService.blockify(layoutParsingType, stripper.getTextPositionSequences(), emptyTableCells, false, cleanRulings);
|
||||
};
|
||||
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
|
||||
@ -5,13 +5,12 @@ import static java.util.stream.Collectors.toSet;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Set;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||
@ -19,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
@ -38,7 +38,11 @@ public class DocstrumBlockificationService {
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {
|
||||
public ClassificationPage blockify(LayoutParsingType layoutParsingType,
|
||||
List<TextPositionSequence> textPositions,
|
||||
List<Cell> cells,
|
||||
boolean xyOrder,
|
||||
CleanRulings cleanRulings) {
|
||||
|
||||
// Underlines and strikethroughs are also in the rulings, but we don't want to split blocks with them, so we use cells.
|
||||
List<Ruling> usedHorizonalRulings = new ArrayList<>();
|
||||
@ -52,11 +56,18 @@ public class DocstrumBlockificationService {
|
||||
});
|
||||
|
||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
|
||||
var pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder);
|
||||
|
||||
List<AbstractPageBlock> pageBlocks;
|
||||
|
||||
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
||||
pageBlocks = toAbstractPageBlocks(zones, cleanRulings.getHorizontal(), cleanRulings.getVertical(), xyOrder);
|
||||
} else {
|
||||
pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder);
|
||||
}
|
||||
|
||||
var classificationPage = new ClassificationPage(pageBlocks);
|
||||
|
||||
mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 0);
|
||||
mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, layoutParsingType == LayoutParsingType.CLARIFYND ? 10 : 0);
|
||||
|
||||
return classificationPage;
|
||||
}
|
||||
@ -223,7 +234,7 @@ public class DocstrumBlockificationService {
|
||||
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
||||
while (itty.hasNext()) {
|
||||
AbstractPageBlock block = itty.next();
|
||||
if(block == null){
|
||||
if (block == null) {
|
||||
continue;
|
||||
}
|
||||
if (block instanceof TablePageBlock) {
|
||||
@ -234,7 +245,7 @@ public class DocstrumBlockificationService {
|
||||
|
||||
for (int i = 0; i < blocks.size(); i++) {
|
||||
|
||||
if(blocks.get(i) == null){
|
||||
if (blocks.get(i) == null) {
|
||||
continue;
|
||||
}
|
||||
if (blocks.get(i) == current) {
|
||||
@ -259,8 +270,8 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
}
|
||||
var blocksIterator = blocks.iterator();
|
||||
while(blocksIterator.hasNext()){
|
||||
if(blocksIterator.next() == null){
|
||||
while (blocksIterator.hasNext()) {
|
||||
if (blocksIterator.next() == null) {
|
||||
blocksIterator.remove();
|
||||
}
|
||||
}
|
||||
|
||||
@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/new/ScrambledTextAfterSorting.pdf";
|
||||
String fileName = "files/WEF Global Risks Report 2017 - Part 1 (2).pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
@ -35,7 +35,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
|
||||
Document document = buildGraph(fileName, LayoutParsingType.CLARIFYND);
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
}
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user