Refactoring to make downstream refactoring easier
This commit is contained in:
parent
d9a3bbbd30
commit
4a5464d6aa
@ -3,32 +3,50 @@ package com.knecon.fforesight.service.layoutparser.processor;
|
|||||||
import static java.lang.String.format;
|
import static java.lang.String.format;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.PdfParsingService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionGridCreatorService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SectionGridCreatorService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@ -39,14 +57,18 @@ public class LayoutParsingPipeline {
|
|||||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||||
private final CvTableParsingAdapter cvTableParsingAdapter;
|
private final CvTableParsingAdapter cvTableParsingAdapter;
|
||||||
private final LayoutParsingStorageService layoutParsingStorageService;
|
private final LayoutParsingStorageService layoutParsingStorageService;
|
||||||
private final PdfParsingService pdfParsingService;
|
|
||||||
private final SectionsBuilderService sectionsBuilderService;
|
private final SectionsBuilderService sectionsBuilderService;
|
||||||
private final SectionGridCreatorService sectionGridCreatorService;
|
private final SectionGridCreatorService sectionGridCreatorService;
|
||||||
private final TaasClassificationService taasClassificationService;
|
private final TaasClassificationService taasClassificationService;
|
||||||
private final RedactManagerClassificationService redactManagerClassificationService;
|
private final RedactManagerClassificationService redactManagerClassificationService;
|
||||||
private final DocuMineClassificationService docuMineClassificationService;
|
private final DocuMineClassificationService docuMineClassificationService;
|
||||||
private final SimplifiedSectionTextService simplifiedSectionTextService;
|
private final SimplifiedSectionTextService simplifiedSectionTextService;
|
||||||
|
private final BodyTextFrameService bodyTextFrameService;
|
||||||
|
private final RulingCleaningService rulingCleaningService;
|
||||||
|
private final TableExtractionService tableExtractionService;
|
||||||
|
private final TaasBlockificationService taasBlockificationService;
|
||||||
|
private final DocuMineBlockificationService docuMineBlockificationService;
|
||||||
|
private final RedactManagerBlockificationService redactManagerBlockificationService;
|
||||||
|
|
||||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||||
|
|
||||||
@ -63,7 +85,9 @@ public class LayoutParsingPipeline {
|
|||||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||||
}
|
}
|
||||||
|
|
||||||
Document documentGraph = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
|
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
|
||||||
|
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||||
|
|
||||||
int numberOfPages = originDocument.getNumberOfPages();
|
int numberOfPages = originDocument.getNumberOfPages();
|
||||||
|
|
||||||
layoutParsingStorageService.storeSectionGrid(layoutParsingRequest, sectionGridCreatorService.createSectionGrid(documentGraph));
|
layoutParsingStorageService.storeSectionGrid(layoutParsingRequest, sectionGridCreatorService.createSectionGrid(documentGraph));
|
||||||
@ -88,15 +112,72 @@ public class LayoutParsingPipeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public Document parseLayout(LayoutParsingType layoutParsingType,
|
@SneakyThrows
|
||||||
|
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
|
||||||
PDDocument originDocument,
|
PDDocument originDocument,
|
||||||
ImageServiceResponse imageServiceResponse,
|
ImageServiceResponse imageServiceResponse,
|
||||||
TableServiceResponse tableServiceResponse) {
|
TableServiceResponse tableServiceResponse) {
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType,
|
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||||
originDocument,
|
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||||
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
|
|
||||||
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
|
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||||
|
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||||
|
|
||||||
|
originDocument.setAllSecurityToBeRemoved(true);
|
||||||
|
long pageCount = originDocument.getNumberOfPages();
|
||||||
|
|
||||||
|
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||||
|
classificationDocument.setPages(classificationPages);
|
||||||
|
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||||
|
PDPage pdPage = originDocument.getPage(pageNumber - 1);
|
||||||
|
stripper.setPageNumber(pageNumber);
|
||||||
|
stripper.setStartPage(pageNumber);
|
||||||
|
stripper.setEndPage(pageNumber);
|
||||||
|
stripper.setPdpage(pdPage);
|
||||||
|
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE)) {
|
||||||
|
stripper.setSortByPosition(true);
|
||||||
|
}
|
||||||
|
stripper.getText(originDocument);
|
||||||
|
|
||||||
|
PDRectangle pdr = pdPage.getMediaBox();
|
||||||
|
|
||||||
|
int rotation = pdPage.getRotation();
|
||||||
|
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||||
|
|
||||||
|
PDRectangle cropbox = pdPage.getCropBox();
|
||||||
|
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
|
||||||
|
stripper.getRulings(),
|
||||||
|
stripper.getMinCharWidth(),
|
||||||
|
stripper.getMaxCharHeight());
|
||||||
|
|
||||||
|
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||||
|
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
|
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
|
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
|
};
|
||||||
|
classificationPage.setCleanRulings(cleanRulings);
|
||||||
|
classificationPage.setRotation(rotation);
|
||||||
|
classificationPage.setLandscape(isLandscape);
|
||||||
|
classificationPage.setPageNumber(pageNumber);
|
||||||
|
classificationPage.setPageWidth(cropbox.getWidth());
|
||||||
|
classificationPage.setPageHeight(cropbox.getHeight());
|
||||||
|
|
||||||
|
// If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
|
||||||
|
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
|
||||||
|
classificationPage.setImages(pdfImages.get(pageNumber));
|
||||||
|
imageServiceResponseAdapter.findOcr(classificationPage);
|
||||||
|
}
|
||||||
|
|
||||||
|
tableExtractionService.extractTables(cleanRulings, classificationPage, layoutParsingType);
|
||||||
|
buildPageStatistics(classificationPage);
|
||||||
|
increaseDocumentStatistics(classificationPage, classificationDocument);
|
||||||
|
|
||||||
|
classificationPages.add(classificationPage);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
||||||
|
|
||||||
switch (layoutParsingType) {
|
switch (layoutParsingType) {
|
||||||
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
||||||
@ -107,40 +188,39 @@ public class LayoutParsingPipeline {
|
|||||||
sectionsBuilderService.buildSections(classificationDocument);
|
sectionsBuilderService.buildSections(classificationDocument);
|
||||||
sectionsBuilderService.addImagesToSections(classificationDocument);
|
sectionsBuilderService.addImagesToSections(classificationDocument);
|
||||||
|
|
||||||
return DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
return classificationDocument;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||||
|
|
||||||
public Document parseLayoutWithTimer(LayoutParsingType layoutParsingType,
|
if (!classificationPage.isLandscape()) {
|
||||||
PDDocument originDocument,
|
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||||
ImageServiceResponse imageServiceResponse,
|
|
||||||
TableServiceResponse tableServiceResponse) {
|
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType,
|
|
||||||
originDocument,
|
|
||||||
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
|
|
||||||
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
|
|
||||||
|
|
||||||
System.out.printf("parsed %d ms", System.currentTimeMillis() - start);
|
|
||||||
|
|
||||||
start = System.currentTimeMillis();
|
|
||||||
switch (layoutParsingType) {
|
|
||||||
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
|
||||||
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
|
||||||
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
|
||||||
}
|
}
|
||||||
System.out.printf(", classified %d ms", System.currentTimeMillis() - start);
|
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||||
|
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||||
start = System.currentTimeMillis();
|
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
||||||
sectionsBuilderService.buildSections(classificationDocument);
|
|
||||||
System.out.printf(", sections built %d ms", System.currentTimeMillis() - start);
|
|
||||||
|
|
||||||
start = System.currentTimeMillis();
|
|
||||||
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
|
||||||
System.out.printf(", graph constructed %d ms", System.currentTimeMillis() - start);
|
|
||||||
return document;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void buildPageStatistics(ClassificationPage classificationPage) {
|
||||||
|
|
||||||
|
// Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
|
||||||
|
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
|
||||||
|
if (textBlock instanceof TextPageBlock) {
|
||||||
|
if (((TextPageBlock) textBlock).getSequences() == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
|
||||||
|
classificationPage.getTextHeightCounter().add(word.getTextHeight());
|
||||||
|
classificationPage.getFontCounter().add(word.getFont());
|
||||||
|
classificationPage.getFontSizeCounter().add(word.getFontSize());
|
||||||
|
classificationPage.getFontStyleCounter().add(word.getFontStyle());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -5,6 +5,7 @@ import java.util.List;
|
|||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
@ -35,4 +36,6 @@ public class ClassificationPage {
|
|||||||
private float pageWidth;
|
private float pageWidth;
|
||||||
private float pageHeight;
|
private float pageHeight;
|
||||||
|
|
||||||
|
CleanRulings cleanRulings;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
|||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
@ -17,5 +18,5 @@ public class PageContents {
|
|||||||
List<TextPositionSequence> sortedTextPositionSequences;
|
List<TextPositionSequence> sortedTextPositionSequences;
|
||||||
Rectangle2D cropBox;
|
Rectangle2D cropBox;
|
||||||
Rectangle2D mediaBox;
|
Rectangle2D mediaBox;
|
||||||
|
List<Ruling> rulings;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -28,6 +28,7 @@ public class Ruling extends Line2D.Float {
|
|||||||
super(p1, p2);
|
super(p1, p2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public Ruling straightenVertical() {
|
public Ruling straightenVertical() {
|
||||||
|
|
||||||
double y1 = Math.min(getY1(), getY2());
|
double y1 = Math.min(getY1(), getY2());
|
||||||
@ -36,6 +37,7 @@ public class Ruling extends Line2D.Float {
|
|||||||
return new Ruling(new Point2D.Double(x, y1), new Point2D.Double(x, y2));
|
return new Ruling(new Point2D.Double(x, y1), new Point2D.Double(x, y2));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public Ruling straightenHorizontal() {
|
public Ruling straightenHorizontal() {
|
||||||
|
|
||||||
double x1 = Math.min(getX1(), getX2());
|
double x1 = Math.min(getX1(), getX2());
|
||||||
@ -444,6 +446,16 @@ public class Ruling extends Line2D.Float {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean almostMatches(Ruling ruling) {
|
||||||
|
|
||||||
|
final float TOLERANCE = 1;
|
||||||
|
return Math.abs(ruling.getX1() - x1) < TOLERANCE &&//
|
||||||
|
Math.abs(ruling.getY1() - y1) < TOLERANCE &&//
|
||||||
|
Math.abs(ruling.getX2() - x2) < TOLERANCE &&//
|
||||||
|
Math.abs(ruling.getY2() - y2) < TOLERANCE;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private enum SOType {
|
private enum SOType {
|
||||||
VERTICAL,
|
VERTICAL,
|
||||||
HRIGHT,
|
HRIGHT,
|
||||||
|
|||||||
@ -8,6 +8,7 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlo
|
|||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
@ -20,6 +21,65 @@ public class BodyTextFrameService {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {
|
||||||
|
|
||||||
|
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
|
||||||
|
Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType);
|
||||||
|
for (ClassificationPage page : classificationDocument.getPages()) {
|
||||||
|
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
private Rectangle calculateBodyTextFrameByRulings(List<ClassificationPage> pages) {
|
||||||
|
|
||||||
|
Map<ClassificationPage, List<Ruling>> potentialHeaderRulingsPerPage = new HashMap<>();
|
||||||
|
Map<ClassificationPage, List<Ruling>> potentialFooterRulingsPerPage = new HashMap<>();
|
||||||
|
|
||||||
|
for (var page : pages) {
|
||||||
|
potentialHeaderRulingsPerPage.put(page,
|
||||||
|
page.getCleanRulings()
|
||||||
|
.getHorizontal()
|
||||||
|
.stream()
|
||||||
|
.filter(ruling -> ruling.getY1() > page.getPageHeight() * 0.8)
|
||||||
|
.filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth())
|
||||||
|
.toList());
|
||||||
|
potentialFooterRulingsPerPage.put(page,
|
||||||
|
page.getCleanRulings()
|
||||||
|
.getHorizontal()
|
||||||
|
.stream()
|
||||||
|
.filter(ruling -> ruling.getY1() < page.getPageHeight() * 0.2)
|
||||||
|
.filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth())
|
||||||
|
.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
Optional<Ruling> headerRuling = potentialHeaderRulingsPerPage.values()
|
||||||
|
.stream()
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.filter(ruling -> potentialHeaderRulingsPerPage.values()
|
||||||
|
.stream()
|
||||||
|
.filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches))
|
||||||
|
.count() > pages.size() * RULING_THRESHOLD_FACTOR)
|
||||||
|
.min(Comparator.comparingDouble(Ruling::getY1));
|
||||||
|
|
||||||
|
Optional<Ruling> footerRuling = potentialFooterRulingsPerPage.values()
|
||||||
|
.stream()
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.filter(ruling -> potentialFooterRulingsPerPage.values()
|
||||||
|
.stream()
|
||||||
|
.filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches))
|
||||||
|
.count() > pages.size() * RULING_THRESHOLD_FACTOR)
|
||||||
|
.max(Comparator.comparingDouble(Ruling::getY1));
|
||||||
|
|
||||||
|
double maxY = headerRuling.isPresent() ? headerRuling.get().y1 : pages.stream().mapToDouble(ClassificationPage::getPageHeight).max().orElse(Double.MAX_VALUE);
|
||||||
|
double minY = footerRuling.map(ruling -> ruling.y1).orElse(0F);
|
||||||
|
double maxX = pages.stream().mapToDouble(ClassificationPage::getPageWidth).max().orElse(Double.MAX_VALUE);
|
||||||
|
|
||||||
|
return new Rectangle(new Point((float) maxX, (float) maxY), (float) 0, (float) minY, -1);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Adjusts and sets the body text frame to a page.
|
* Adjusts and sets the body text frame to a page.
|
||||||
@ -34,7 +94,7 @@ public class BodyTextFrameService {
|
|||||||
* @param bodyTextFrame frame that contains the main text on portrait pages
|
* @param bodyTextFrame frame that contains the main text on portrait pages
|
||||||
* @param landscapeBodyTextFrame frame that contains the main text on landscape pages
|
* @param landscapeBodyTextFrame frame that contains the main text on landscape pages
|
||||||
*/
|
*/
|
||||||
public void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
|
private void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
|
||||||
|
|
||||||
Rectangle textFrame = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
|
Rectangle textFrame = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
|
||||||
|
|
||||||
@ -69,7 +129,10 @@ public class BodyTextFrameService {
|
|||||||
* @param landscape Calculate for landscape or portrait
|
* @param landscape Calculate for landscape or portrait
|
||||||
* @return Rectangle of the text frame
|
* @return Rectangle of the text frame
|
||||||
*/
|
*/
|
||||||
public Rectangle calculateBodyTextFrame(List<ClassificationPage> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape, LayoutParsingType layoutParsingType) {
|
private Rectangle calculateBodyTextFrame(List<ClassificationPage> pages,
|
||||||
|
FloatFrequencyCounter documentFontSizeCounter,
|
||||||
|
boolean landscape,
|
||||||
|
LayoutParsingType layoutParsingType) {
|
||||||
|
|
||||||
float approximateHeaderLineCount;
|
float approximateHeaderLineCount;
|
||||||
if (layoutParsingType.equals(LayoutParsingType.TAAS)) {
|
if (layoutParsingType.equals(LayoutParsingType.TAAS)) {
|
||||||
@ -95,8 +158,8 @@ public class BodyTextFrameService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
|
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
|
||||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10)
|
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10) || !layoutParsingType.equals(
|
||||||
|| !layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount){
|
LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -21,9 +21,9 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTrans
|
|||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
@UtilityClass
|
@UtilityClass
|
||||||
public class TextPositionSequenceSorter {
|
public class PageContentExtractor {
|
||||||
|
|
||||||
public List<PageContents> getSortedTextPositionsWithPages(String filename) throws IOException {
|
public List<PageContents> getSortedPageContents(String filename) throws IOException {
|
||||||
|
|
||||||
List<PageContents> textPositionSequencesPerPage = new LinkedList<>();
|
List<PageContents> textPositionSequencesPerPage = new LinkedList<>();
|
||||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
|
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
|
||||||
@ -49,7 +49,8 @@ public class TextPositionSequenceSorter {
|
|||||||
|
|
||||||
textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
|
textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
|
||||||
RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
|
RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
|
||||||
RectangleTransformations.toRectangle2D(pdPage.getMediaBox())));
|
RectangleTransformations.toRectangle2D(pdPage.getMediaBox()),
|
||||||
|
stripper.getRulings()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1,154 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
|
||||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
|
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
|
|
||||||
@Slf4j
|
|
||||||
@Service
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
public class PdfParsingService {
|
|
||||||
|
|
||||||
private final RulingCleaningService rulingCleaningService;
|
|
||||||
private final TableExtractionService tableExtractionService;
|
|
||||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
|
||||||
private final TaasBlockificationService taasBlockificationService;
|
|
||||||
private final DocuMineBlockificationService docuMineBlockificationService;
|
|
||||||
private final RedactManagerBlockificationService redactManagerBlockificationService;
|
|
||||||
|
|
||||||
|
|
||||||
public ClassificationDocument parseDocument(LayoutParsingType layoutParsingType,
|
|
||||||
PDDocument originDocument,
|
|
||||||
Map<Integer, List<TableCells>> pdfTableCells,
|
|
||||||
Map<Integer, List<ClassifiedImage>> pdfImages) {
|
|
||||||
|
|
||||||
ClassificationDocument document = new ClassificationDocument();
|
|
||||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
|
||||||
|
|
||||||
originDocument.setAllSecurityToBeRemoved(true);
|
|
||||||
long pageCount = originDocument.getNumberOfPages();
|
|
||||||
|
|
||||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
|
||||||
parsePage(layoutParsingType, pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber);
|
|
||||||
}
|
|
||||||
|
|
||||||
document.setPages(classificationPages);
|
|
||||||
|
|
||||||
return document;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
private void parsePage(LayoutParsingType layoutParsingType,
|
|
||||||
Map<Integer, List<ClassifiedImage>> pdfImages,
|
|
||||||
PDDocument pdDocument,
|
|
||||||
Map<Integer, List<TableCells>> pdfTableCells,
|
|
||||||
ClassificationDocument document,
|
|
||||||
List<ClassificationPage> classificationPages,
|
|
||||||
int pageNumber) {
|
|
||||||
|
|
||||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
|
||||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
|
||||||
stripper.setPageNumber(pageNumber);
|
|
||||||
stripper.setStartPage(pageNumber);
|
|
||||||
stripper.setEndPage(pageNumber);
|
|
||||||
stripper.setPdpage(pdPage);
|
|
||||||
if(layoutParsingType.equals(LayoutParsingType.DOCUMINE)){
|
|
||||||
stripper.setSortByPosition(true);
|
|
||||||
}
|
|
||||||
stripper.getText(pdDocument);
|
|
||||||
|
|
||||||
PDRectangle pdr = pdPage.getMediaBox();
|
|
||||||
|
|
||||||
int rotation = pdPage.getRotation();
|
|
||||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
|
||||||
|
|
||||||
PDRectangle cropbox = pdPage.getCropBox();
|
|
||||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
|
|
||||||
stripper.getRulings(),
|
|
||||||
stripper.getMinCharWidth(),
|
|
||||||
stripper.getMaxCharHeight());
|
|
||||||
|
|
||||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
|
||||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
|
||||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
|
||||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
|
||||||
};
|
|
||||||
|
|
||||||
classificationPage.setRotation(rotation);
|
|
||||||
classificationPage.setLandscape(isLandscape);
|
|
||||||
classificationPage.setPageNumber(pageNumber);
|
|
||||||
classificationPage.setPageWidth(cropbox.getWidth());
|
|
||||||
classificationPage.setPageHeight(cropbox.getHeight());
|
|
||||||
|
|
||||||
// If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
|
|
||||||
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
|
|
||||||
classificationPage.setImages(pdfImages.get(pageNumber));
|
|
||||||
imageServiceResponseAdapter.findOcr(classificationPage);
|
|
||||||
}
|
|
||||||
|
|
||||||
tableExtractionService.extractTables(cleanRulings, classificationPage, layoutParsingType);
|
|
||||||
buildPageStatistics(classificationPage);
|
|
||||||
increaseDocumentStatistics(classificationPage, document);
|
|
||||||
|
|
||||||
classificationPages.add(classificationPage);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
|
||||||
|
|
||||||
if (!classificationPage.isLandscape()) {
|
|
||||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
|
||||||
}
|
|
||||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
|
||||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
|
||||||
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void buildPageStatistics(ClassificationPage classificationPage) {
|
|
||||||
|
|
||||||
// Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
|
|
||||||
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
|
|
||||||
if (textBlock instanceof TextPageBlock) {
|
|
||||||
if (((TextPageBlock) textBlock).getSequences() == null) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
|
|
||||||
classificationPage.getTextHeightCounter().add(word.getTextHeight());
|
|
||||||
classificationPage.getFontCounter().add(word.getFont());
|
|
||||||
classificationPage.getFontSizeCounter().add(word.getFontSize());
|
|
||||||
classificationPage.getFontStyleCounter().add(word.getFontStyle());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@ -7,14 +7,11 @@ import java.util.regex.Pattern;
|
|||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -25,7 +22,6 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class DocuMineClassificationService {
|
public class DocuMineClassificationService {
|
||||||
|
|
||||||
private final BodyTextFrameService bodyTextFrameService;
|
|
||||||
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||||
@ -33,14 +29,11 @@ public class DocuMineClassificationService {
|
|||||||
|
|
||||||
public void classifyDocument(ClassificationDocument document) {
|
public void classifyDocument(ClassificationDocument document) {
|
||||||
|
|
||||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false, LayoutParsingType.DOCUMINE);
|
|
||||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true, LayoutParsingType.DOCUMINE);
|
|
||||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||||
|
|
||||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||||
|
|
||||||
for (ClassificationPage page : document.getPages()) {
|
for (ClassificationPage page : document.getPages()) {
|
||||||
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
|
||||||
classifyPage(page, document, headlineFontSizes);
|
classifyPage(page, document, headlineFontSizes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -5,14 +5,11 @@ import java.util.regex.Pattern;
|
|||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -23,19 +20,14 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class RedactManagerClassificationService {
|
public class RedactManagerClassificationService {
|
||||||
|
|
||||||
private final BodyTextFrameService bodyTextFrameService;
|
|
||||||
|
|
||||||
|
|
||||||
public void classifyDocument(ClassificationDocument document) {
|
public void classifyDocument(ClassificationDocument document) {
|
||||||
|
|
||||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false, LayoutParsingType.REDACT_MANAGER);
|
|
||||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true, LayoutParsingType.REDACT_MANAGER);
|
|
||||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||||
|
|
||||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||||
|
|
||||||
for (ClassificationPage page : document.getPages()) {
|
for (ClassificationPage page : document.getPages()) {
|
||||||
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
|
||||||
classifyPage(page, document, headlineFontSizes);
|
classifyPage(page, document, headlineFontSizes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -5,8 +5,6 @@ import java.util.regex.Pattern;
|
|||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
@ -28,14 +26,13 @@ public class TaasClassificationService {
|
|||||||
|
|
||||||
public void classifyDocument(ClassificationDocument document) {
|
public void classifyDocument(ClassificationDocument document) {
|
||||||
|
|
||||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false, LayoutParsingType.TAAS);
|
|
||||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true, LayoutParsingType.TAAS);
|
|
||||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||||
|
|
||||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||||
|
|
||||||
for (ClassificationPage page : document.getPages()) {
|
for (ClassificationPage page : document.getPages()) {
|
||||||
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
|
||||||
classifyPage(page, document, headlineFontSizes);
|
classifyPage(page, document, headlineFontSizes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -152,7 +152,7 @@ public class PdfVisualisationUtility {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List<Line2D> line2DS, Options options) {
|
public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List<? extends Line2D> line2DS, Options options) {
|
||||||
|
|
||||||
var pdPage = pdDocument.getPage(pageNumber - 1);
|
var pdPage = pdDocument.getPage(pageNumber - 1);
|
||||||
var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||||
|
|||||||
@ -22,10 +22,11 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||||
@ -46,7 +47,7 @@ public class BdrJsonBuildTest extends AbstractTest {
|
|||||||
|
|
||||||
try (InputStream inputStream = new FileInputStream(filename)) {
|
try (InputStream inputStream = new FileInputStream(filename)) {
|
||||||
try (PDDocument pdDocument = Loader.loadPDF(inputStream)) {
|
try (PDDocument pdDocument = Loader.loadPDF(inputStream)) {
|
||||||
return layoutParsingPipeline.parseLayoutWithTimer(LayoutParsingType.TAAS, pdDocument, new ImageServiceResponse(), new TableServiceResponse());
|
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS, pdDocument, new ImageServiceResponse(), new TableServiceResponse()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -34,6 +34,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
@ -94,10 +95,10 @@ public class HeadlinesGoldStandardIntegrationTest {
|
|||||||
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
|
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
|
||||||
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
|
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
|
||||||
|
|
||||||
Document documentGraph = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
Loader.loadPDF(pdfFileResource.getInputStream()),
|
Loader.loadPDF(pdfFileResource.getInputStream()),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse());
|
new TableServiceResponse()));
|
||||||
|
|
||||||
var foundHeadlines = documentGraph.streamAllSubNodes()
|
var foundHeadlines = documentGraph.streamAllSubNodes()
|
||||||
.map(SemanticNode::getHeadline)
|
.map(SemanticNode::getHeadline)
|
||||||
|
|||||||
@ -15,6 +15,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
@ -29,7 +30,7 @@ public class BuildDocumentGraphTest extends AbstractTest {
|
|||||||
@Disabled
|
@Disabled
|
||||||
public void buildMetolachlor() {
|
public void buildMetolachlor() {
|
||||||
|
|
||||||
Document documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06");
|
Document documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||||
assertEquals(221, documentGraph.getPages().size());
|
assertEquals(221, documentGraph.getPages().size());
|
||||||
assertEquals(220, documentGraph.getPages().stream().filter(page -> page.getHeader().hasText()).count());
|
assertEquals(220, documentGraph.getPages().stream().filter(page -> page.getHeader().hasText()).count());
|
||||||
assertEquals(0, documentGraph.getPages().stream().filter(page -> page.getFooter().hasText()).count());
|
assertEquals(0, documentGraph.getPages().stream().filter(page -> page.getFooter().hasText()).count());
|
||||||
@ -47,10 +48,10 @@ public class BuildDocumentGraphTest extends AbstractTest {
|
|||||||
ClassPathResource fileResource = new ClassPathResource(filename);
|
ClassPathResource fileResource = new ClassPathResource(filename);
|
||||||
|
|
||||||
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) {
|
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) {
|
||||||
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||||
pdDocument,
|
pdDocument,
|
||||||
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
|
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
|
||||||
new TableServiceResponse());
|
new TableServiceResponse()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
@ -54,10 +55,10 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void writeJsons(Path filename) {
|
private void writeJsons(Path filename) {
|
||||||
|
|
||||||
Document documentGraph = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
Loader.loadPDF(new FileInputStream(filename.toFile())),
|
Loader.loadPDF(new FileInputStream(filename.toFile())),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse());
|
new TableServiceResponse()));
|
||||||
|
|
||||||
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
|
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
|
||||||
ObjectMapper mapper = ObjectMapperFactory.create();
|
ObjectMapper mapper = ObjectMapperFactory.create();
|
||||||
|
|||||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.server.graph;
|
|||||||
import java.awt.Color;
|
import java.awt.Color;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
import org.apache.pdfbox.Loader;
|
import org.apache.pdfbox.Loader;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
@ -23,7 +24,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
|
|||||||
@Disabled
|
@Disabled
|
||||||
public void visualizeMetolachlor() {
|
public void visualizeMetolachlor() {
|
||||||
|
|
||||||
String filename = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06";
|
String filename = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||||
visualizePdf(filename);
|
visualizePdf(filename);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -33,7 +34,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
|
|||||||
@Disabled
|
@Disabled
|
||||||
public void visualizeRotatedTestDocument() {
|
public void visualizeRotatedTestDocument() {
|
||||||
|
|
||||||
String filename = "files/211";
|
String filename = "files/new/RotateTestFile.pdf";
|
||||||
visualizePdf(filename);
|
visualizePdf(filename);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -43,7 +44,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
|
|||||||
@Disabled
|
@Disabled
|
||||||
public void visualizeCraftedDocument() {
|
public void visualizeCraftedDocument() {
|
||||||
|
|
||||||
String filename = "files/crafted document";
|
String filename = "files/crafted document.pdf";
|
||||||
visualizePdf(filename);
|
visualizePdf(filename);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -60,8 +61,8 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
|
|||||||
|
|
||||||
private static void visualizeSemanticNodes(String filename, Document documentGraph, TextBlock textBlock) throws IOException {
|
private static void visualizeSemanticNodes(String filename, Document documentGraph, TextBlock textBlock) throws IOException {
|
||||||
|
|
||||||
File tmpFile = File.createTempFile(filename, "SEMANTIC_NODES_BBOX.pdf");
|
File tmpFile = new File("/tmp/" + Path.of(filename).getFileName().toString() + "_SEMANTIC_NODES_BBOX.pdf");
|
||||||
ClassPathResource fileResource = new ClassPathResource(filename + ".pdf");
|
ClassPathResource fileResource = new ClassPathResource(filename);
|
||||||
|
|
||||||
try (var fileStream = fileResource.getInputStream();//
|
try (var fileStream = fileResource.getInputStream();//
|
||||||
PDDocument pdDocument = Loader.loadPDF(fileStream)//
|
PDDocument pdDocument = Loader.loadPDF(fileStream)//
|
||||||
|
|||||||
@ -21,16 +21,16 @@ import org.springframework.core.io.ClassPathResource;
|
|||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.PdfParsingService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||||
@ -40,7 +40,7 @@ import lombok.SneakyThrows;
|
|||||||
public class PdfSegmentationServiceTest extends AbstractTest {
|
public class PdfSegmentationServiceTest extends AbstractTest {
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
private PdfParsingService pdfParsingService;
|
private LayoutParsingPipeline layoutParsingPipeline;
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
private ObjectMapper objectMapper;
|
private ObjectMapper objectMapper;
|
||||||
@ -57,12 +57,13 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
@Autowired
|
@Autowired
|
||||||
private SectionsBuilderService sectionsBuilderService;
|
private SectionsBuilderService sectionsBuilderService;
|
||||||
|
|
||||||
|
|
||||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(LayoutParsingType.REDACT_MANAGER,
|
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
originDocument,
|
originDocument,
|
||||||
cvTableParsingAdapter.buildCvParsedTablesPerPage(new TableServiceResponse()),
|
new ImageServiceResponse(),
|
||||||
imageServiceResponseAdapter.buildClassifiedImagesPerPage(new ImageServiceResponse()));
|
new TableServiceResponse());
|
||||||
|
|
||||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||||
|
|
||||||
|
|||||||
@ -14,7 +14,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.DividingCol
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
@ -30,7 +30,7 @@ class GapAcrossLinesDetectionServiceTest {
|
|||||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
|
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
|
||||||
System.out.println("start TextPosition extraction");
|
System.out.println("start TextPosition extraction");
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
|
List<PageInformation> pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList();
|
||||||
List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
|
List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
|
||||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||||
System.out.println("start column detection");
|
System.out.println("start column detection");
|
||||||
@ -56,7 +56,7 @@ class GapAcrossLinesDetectionServiceTest {
|
|||||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
|
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
|
||||||
System.out.println("start TextPosition extraction");
|
System.out.println("start TextPosition extraction");
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
List<PageContents> sortedTextPositionSequencesPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename);
|
List<PageContents> sortedTextPositionSequencesPerPage = PageContentExtractor.getSortedPageContents(filename);
|
||||||
List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
|
List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
|
||||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||||
System.out.println("start column detection");
|
System.out.println("start column detection");
|
||||||
|
|||||||
@ -13,7 +13,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.PageInformatio
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||||
|
|
||||||
@ -28,7 +28,7 @@ class InvisibleTableDetectionServiceTest {
|
|||||||
|
|
||||||
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
|
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
|
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
|
||||||
List<PageInformation> pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
|
List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
|
||||||
|
|
||||||
int pageNumber = 1;
|
int pageNumber = 1;
|
||||||
Rectangle2D tableBBox = pageContents.get(0)
|
Rectangle2D tableBBox = pageContents.get(0)
|
||||||
|
|||||||
@ -7,7 +7,7 @@ import org.junit.jupiter.api.Disabled;
|
|||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
@ -20,7 +20,7 @@ class MainBodyTextFrameExtractionServiceTest {
|
|||||||
|
|
||||||
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
|
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||||
String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString();
|
String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString();
|
||||||
List<PageContents> sortedTextPositionSequence = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
|
List<PageContents> sortedTextPositionSequence = PageContentExtractor.getSortedPageContents(fileName);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -9,12 +9,12 @@ import org.junit.jupiter.api.Test;
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
class TextPositionSequenceSorterTest {
|
class PageContentExtractorTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Disabled
|
@Disabled
|
||||||
@ -24,7 +24,7 @@ class TextPositionSequenceSorterTest {
|
|||||||
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
|
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();
|
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();
|
||||||
|
|
||||||
List<PageContents> textPositionPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
|
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
|
||||||
|
|
||||||
PdfDraw.drawRectanglesPerPageNumberedByLine(fileName,
|
PdfDraw.drawRectanglesPerPageNumberedByLine(fileName,
|
||||||
textPositionPerPage.stream()
|
textPositionPerPage.stream()
|
||||||
@ -8,7 +8,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
@ -24,7 +24,7 @@ class PageInformationServiceTest {
|
|||||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
|
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
|
||||||
System.out.println("start TextPosition extraction");
|
System.out.println("start TextPosition extraction");
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
|
List<PageInformation> pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList();
|
||||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||||
System.out.println("start gap detection");
|
System.out.println("start gap detection");
|
||||||
start = System.currentTimeMillis();
|
start = System.currentTimeMillis();
|
||||||
@ -47,7 +47,7 @@ class PageInformationServiceTest {
|
|||||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
|
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
|
||||||
System.out.println("start TextPosition extraction");
|
System.out.println("start TextPosition extraction");
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
|
List<PageInformation> pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList();
|
||||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||||
System.out.println("start gap detection");
|
System.out.println("start gap detection");
|
||||||
start = System.currentTimeMillis();
|
start = System.currentTimeMillis();
|
||||||
|
|||||||
@ -0,0 +1,38 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||||
|
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Disabled;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
|
public class RulingCleaningServiceTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Disabled
|
||||||
|
@SneakyThrows
|
||||||
|
public void textRulingExtraction() {
|
||||||
|
|
||||||
|
String fileName = "files/BASF/2013-1110704.pdf";
|
||||||
|
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
|
||||||
|
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents("files/BASF/2013-1110704.pdf");
|
||||||
|
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
|
||||||
|
|
||||||
|
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||||
|
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
|
||||||
|
for (PageContents pageContent : pageContents) {
|
||||||
|
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 20));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -24,6 +24,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
@ -206,6 +207,21 @@ public class PdfDraw {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public static void drawLinesPerPage(String filename, List<List<Ruling>> linesPerPage, String tmpFileName) {
|
||||||
|
|
||||||
|
try (InputStream inputStream = new ClassPathResource(filename).getInputStream();//
|
||||||
|
PDDocument pdDocument = Loader.loadPDF(inputStream);//
|
||||||
|
var out = new FileOutputStream(tmpFileName)//
|
||||||
|
) {
|
||||||
|
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||||
|
PdfVisualisationUtility.drawLine2DList(pdDocument, pageNumber, linesPerPage.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build());
|
||||||
|
}
|
||||||
|
pdDocument.save(out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Builder
|
@Builder
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
@Getter
|
@Getter
|
||||||
|
|||||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user