Refactoring to make downstream refactoring easier
This commit is contained in:
parent
d9a3bbbd30
commit
4a5464d6aa
@ -3,32 +3,50 @@ package com.knecon.fforesight.service.layoutparser.processor;
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PdfParsingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionGridCreatorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@ -39,14 +57,18 @@ public class LayoutParsingPipeline {
|
||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
private final CvTableParsingAdapter cvTableParsingAdapter;
|
||||
private final LayoutParsingStorageService layoutParsingStorageService;
|
||||
private final PdfParsingService pdfParsingService;
|
||||
private final SectionsBuilderService sectionsBuilderService;
|
||||
private final SectionGridCreatorService sectionGridCreatorService;
|
||||
private final TaasClassificationService taasClassificationService;
|
||||
private final RedactManagerClassificationService redactManagerClassificationService;
|
||||
private final DocuMineClassificationService docuMineClassificationService;
|
||||
private final SimplifiedSectionTextService simplifiedSectionTextService;
|
||||
|
||||
private final BodyTextFrameService bodyTextFrameService;
|
||||
private final RulingCleaningService rulingCleaningService;
|
||||
private final TableExtractionService tableExtractionService;
|
||||
private final TaasBlockificationService taasBlockificationService;
|
||||
private final DocuMineBlockificationService docuMineBlockificationService;
|
||||
private final RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
|
||||
@ -63,7 +85,9 @@ public class LayoutParsingPipeline {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||
}
|
||||
|
||||
Document documentGraph = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
|
||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
|
||||
int numberOfPages = originDocument.getNumberOfPages();
|
||||
|
||||
layoutParsingStorageService.storeSectionGrid(layoutParsingRequest, sectionGridCreatorService.createSectionGrid(documentGraph));
|
||||
@ -88,15 +112,72 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
|
||||
|
||||
public Document parseLayout(LayoutParsingType layoutParsingType,
|
||||
@SneakyThrows
|
||||
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
|
||||
PDDocument originDocument,
|
||||
ImageServiceResponse imageServiceResponse,
|
||||
TableServiceResponse tableServiceResponse) {
|
||||
|
||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType,
|
||||
originDocument,
|
||||
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
|
||||
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
|
||||
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||
|
||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
|
||||
originDocument.setAllSecurityToBeRemoved(true);
|
||||
long pageCount = originDocument.getNumberOfPages();
|
||||
|
||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||
classificationDocument.setPages(classificationPages);
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = originDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE)) {
|
||||
stripper.setSortByPosition(true);
|
||||
}
|
||||
stripper.getText(originDocument);
|
||||
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
|
||||
int rotation = pdPage.getRotation();
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
|
||||
stripper.getRulings(),
|
||||
stripper.getMinCharWidth(),
|
||||
stripper.getMaxCharHeight());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
};
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
classificationPage.setRotation(rotation);
|
||||
classificationPage.setLandscape(isLandscape);
|
||||
classificationPage.setPageNumber(pageNumber);
|
||||
classificationPage.setPageWidth(cropbox.getWidth());
|
||||
classificationPage.setPageHeight(cropbox.getHeight());
|
||||
|
||||
// Whether an image is OCR must be determined before text blocks are moved into tables; otherwise the findOcr algorithm needs to be adapted.
|
||||
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
|
||||
classificationPage.setImages(pdfImages.get(pageNumber));
|
||||
imageServiceResponseAdapter.findOcr(classificationPage);
|
||||
}
|
||||
|
||||
tableExtractionService.extractTables(cleanRulings, classificationPage, layoutParsingType);
|
||||
buildPageStatistics(classificationPage);
|
||||
increaseDocumentStatistics(classificationPage, classificationDocument);
|
||||
|
||||
classificationPages.add(classificationPage);
|
||||
}
|
||||
|
||||
|
||||
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
||||
|
||||
switch (layoutParsingType) {
|
||||
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
||||
@ -107,40 +188,39 @@ public class LayoutParsingPipeline {
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
sectionsBuilderService.addImagesToSections(classificationDocument);
|
||||
|
||||
return DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
return classificationDocument;
|
||||
}
|
||||
|
||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||
|
||||
public Document parseLayoutWithTimer(LayoutParsingType layoutParsingType,
|
||||
PDDocument originDocument,
|
||||
ImageServiceResponse imageServiceResponse,
|
||||
TableServiceResponse tableServiceResponse) {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType,
|
||||
originDocument,
|
||||
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
|
||||
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
|
||||
|
||||
System.out.printf("parsed %d ms", System.currentTimeMillis() - start);
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
switch (layoutParsingType) {
|
||||
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
if (!classificationPage.isLandscape()) {
|
||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||
}
|
||||
System.out.printf(", classified %d ms", System.currentTimeMillis() - start);
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
System.out.printf(", sections built %d ms", System.currentTimeMillis() - start);
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
System.out.printf(", graph constructed %d ms", System.currentTimeMillis() - start);
|
||||
return document;
|
||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
||||
}
|
||||
|
||||
|
||||
private void buildPageStatistics(ClassificationPage classificationPage) {
|
||||
|
||||
// Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
|
||||
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
if (((TextPageBlock) textBlock).getSequences() == null) {
|
||||
continue;
|
||||
}
|
||||
for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
|
||||
classificationPage.getTextHeightCounter().add(word.getTextHeight());
|
||||
classificationPage.getFontCounter().add(word.getFont());
|
||||
classificationPage.getFontSizeCounter().add(word.getFontSize());
|
||||
classificationPage.getFontStyleCounter().add(word.getFontStyle());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -5,6 +5,7 @@ import java.util.List;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
|
||||
import lombok.Data;
|
||||
@ -35,4 +36,6 @@ public class ClassificationPage {
|
||||
private float pageWidth;
|
||||
private float pageHeight;
|
||||
|
||||
CleanRulings cleanRulings;
|
||||
|
||||
}
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -17,5 +18,5 @@ public class PageContents {
|
||||
List<TextPositionSequence> sortedTextPositionSequences;
|
||||
Rectangle2D cropBox;
|
||||
Rectangle2D mediaBox;
|
||||
|
||||
List<Ruling> rulings;
|
||||
}
|
||||
|
||||
@ -28,6 +28,7 @@ public class Ruling extends Line2D.Float {
|
||||
super(p1, p2);
|
||||
}
|
||||
|
||||
|
||||
public Ruling straightenVertical() {
|
||||
|
||||
double y1 = Math.min(getY1(), getY2());
|
||||
@ -36,6 +37,7 @@ public class Ruling extends Line2D.Float {
|
||||
return new Ruling(new Point2D.Double(x, y1), new Point2D.Double(x, y2));
|
||||
}
|
||||
|
||||
|
||||
public Ruling straightenHorizontal() {
|
||||
|
||||
double x1 = Math.min(getX1(), getX2());
|
||||
@ -444,6 +446,16 @@ public class Ruling extends Line2D.Float {
|
||||
}
|
||||
|
||||
|
||||
public boolean almostMatches(Ruling ruling) {
|
||||
|
||||
final float TOLERANCE = 1;
|
||||
return Math.abs(ruling.getX1() - x1) < TOLERANCE &&//
|
||||
Math.abs(ruling.getY1() - y1) < TOLERANCE &&//
|
||||
Math.abs(ruling.getX2() - x2) < TOLERANCE &&//
|
||||
Math.abs(ruling.getY2() - y2) < TOLERANCE;
|
||||
}
|
||||
|
||||
|
||||
private enum SOType {
|
||||
VERTICAL,
|
||||
HRIGHT,
|
||||
|
||||
@ -8,6 +8,7 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlo
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
@ -20,6 +21,65 @@ public class BodyTextFrameService {
|
||||
|
||||
|
||||
|
||||
public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {
|
||||
|
||||
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
|
||||
Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType);
|
||||
for (ClassificationPage page : classificationDocument.getPages()) {
|
||||
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
private Rectangle calculateBodyTextFrameByRulings(List<ClassificationPage> pages) {
|
||||
|
||||
Map<ClassificationPage, List<Ruling>> potentialHeaderRulingsPerPage = new HashMap<>();
|
||||
Map<ClassificationPage, List<Ruling>> potentialFooterRulingsPerPage = new HashMap<>();
|
||||
|
||||
for (var page : pages) {
|
||||
potentialHeaderRulingsPerPage.put(page,
|
||||
page.getCleanRulings()
|
||||
.getHorizontal()
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getY1() > page.getPageHeight() * 0.8)
|
||||
.filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth())
|
||||
.toList());
|
||||
potentialFooterRulingsPerPage.put(page,
|
||||
page.getCleanRulings()
|
||||
.getHorizontal()
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getY1() < page.getPageHeight() * 0.2)
|
||||
.filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth())
|
||||
.toList());
|
||||
}
|
||||
|
||||
Optional<Ruling> headerRuling = potentialHeaderRulingsPerPage.values()
|
||||
.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.filter(ruling -> potentialHeaderRulingsPerPage.values()
|
||||
.stream()
|
||||
.filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches))
|
||||
.count() > pages.size() * RULING_THRESHOLD_FACTOR)
|
||||
.min(Comparator.comparingDouble(Ruling::getY1));
|
||||
|
||||
Optional<Ruling> footerRuling = potentialFooterRulingsPerPage.values()
|
||||
.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.filter(ruling -> potentialFooterRulingsPerPage.values()
|
||||
.stream()
|
||||
.filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches))
|
||||
.count() > pages.size() * RULING_THRESHOLD_FACTOR)
|
||||
.max(Comparator.comparingDouble(Ruling::getY1));
|
||||
|
||||
double maxY = headerRuling.isPresent() ? headerRuling.get().y1 : pages.stream().mapToDouble(ClassificationPage::getPageHeight).max().orElse(Double.MAX_VALUE);
|
||||
double minY = footerRuling.map(ruling -> ruling.y1).orElse(0F);
|
||||
double maxX = pages.stream().mapToDouble(ClassificationPage::getPageWidth).max().orElse(Double.MAX_VALUE);
|
||||
|
||||
return new Rectangle(new Point((float) maxX, (float) maxY), (float) 0, (float) minY, -1);
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Adjusts and sets the body text frame to a page.
|
||||
@ -34,7 +94,7 @@ public class BodyTextFrameService {
|
||||
* @param bodyTextFrame frame that contains the main text on portrait pages
|
||||
* @param landscapeBodyTextFrame frame that contains the main text on landscape pages
|
||||
*/
|
||||
public void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
|
||||
private void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
|
||||
|
||||
Rectangle textFrame = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
|
||||
|
||||
@ -69,7 +129,10 @@ public class BodyTextFrameService {
|
||||
* @param landscape Calculate for landscape or portrait
|
||||
* @return Rectangle of the text frame
|
||||
*/
|
||||
public Rectangle calculateBodyTextFrame(List<ClassificationPage> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape, LayoutParsingType layoutParsingType) {
|
||||
private Rectangle calculateBodyTextFrame(List<ClassificationPage> pages,
|
||||
FloatFrequencyCounter documentFontSizeCounter,
|
||||
boolean landscape,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
|
||||
float approximateHeaderLineCount;
|
||||
if (layoutParsingType.equals(LayoutParsingType.TAAS)) {
|
||||
@ -95,8 +158,8 @@ public class BodyTextFrameService {
|
||||
}
|
||||
|
||||
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10)
|
||||
|| !layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount){
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10) || !layoutParsingType.equals(
|
||||
LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
@ -21,9 +21,9 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTrans
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class TextPositionSequenceSorter {
|
||||
public class PageContentExtractor {
|
||||
|
||||
public List<PageContents> getSortedTextPositionsWithPages(String filename) throws IOException {
|
||||
public List<PageContents> getSortedPageContents(String filename) throws IOException {
|
||||
|
||||
List<PageContents> textPositionSequencesPerPage = new LinkedList<>();
|
||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
|
||||
@ -49,7 +49,8 @@ public class TextPositionSequenceSorter {
|
||||
|
||||
textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
|
||||
RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
|
||||
RectangleTransformations.toRectangle2D(pdPage.getMediaBox())));
|
||||
RectangleTransformations.toRectangle2D(pdPage.getMediaBox()),
|
||||
stripper.getRulings()));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1,154 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class PdfParsingService {
|
||||
|
||||
private final RulingCleaningService rulingCleaningService;
|
||||
private final TableExtractionService tableExtractionService;
|
||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
private final TaasBlockificationService taasBlockificationService;
|
||||
private final DocuMineBlockificationService docuMineBlockificationService;
|
||||
private final RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
|
||||
|
||||
public ClassificationDocument parseDocument(LayoutParsingType layoutParsingType,
|
||||
PDDocument originDocument,
|
||||
Map<Integer, List<TableCells>> pdfTableCells,
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages) {
|
||||
|
||||
ClassificationDocument document = new ClassificationDocument();
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
|
||||
originDocument.setAllSecurityToBeRemoved(true);
|
||||
long pageCount = originDocument.getNumberOfPages();
|
||||
|
||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||
parsePage(layoutParsingType, pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber);
|
||||
}
|
||||
|
||||
document.setPages(classificationPages);
|
||||
|
||||
return document;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void parsePage(LayoutParsingType layoutParsingType,
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages,
|
||||
PDDocument pdDocument,
|
||||
Map<Integer, List<TableCells>> pdfTableCells,
|
||||
ClassificationDocument document,
|
||||
List<ClassificationPage> classificationPages,
|
||||
int pageNumber) {
|
||||
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
if(layoutParsingType.equals(LayoutParsingType.DOCUMINE)){
|
||||
stripper.setSortByPosition(true);
|
||||
}
|
||||
stripper.getText(pdDocument);
|
||||
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
|
||||
int rotation = pdPage.getRotation();
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
|
||||
stripper.getRulings(),
|
||||
stripper.getMinCharWidth(),
|
||||
stripper.getMaxCharHeight());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
};
|
||||
|
||||
classificationPage.setRotation(rotation);
|
||||
classificationPage.setLandscape(isLandscape);
|
||||
classificationPage.setPageNumber(pageNumber);
|
||||
classificationPage.setPageWidth(cropbox.getWidth());
|
||||
classificationPage.setPageHeight(cropbox.getHeight());
|
||||
|
||||
// If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
|
||||
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
|
||||
classificationPage.setImages(pdfImages.get(pageNumber));
|
||||
imageServiceResponseAdapter.findOcr(classificationPage);
|
||||
}
|
||||
|
||||
tableExtractionService.extractTables(cleanRulings, classificationPage, layoutParsingType);
|
||||
buildPageStatistics(classificationPage);
|
||||
increaseDocumentStatistics(classificationPage, document);
|
||||
|
||||
classificationPages.add(classificationPage);
|
||||
}
|
||||
|
||||
|
||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||
|
||||
if (!classificationPage.isLandscape()) {
|
||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||
}
|
||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
||||
}
|
||||
|
||||
|
||||
private void buildPageStatistics(ClassificationPage classificationPage) {
|
||||
|
||||
// Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
|
||||
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
if (((TextPageBlock) textBlock).getSequences() == null) {
|
||||
continue;
|
||||
}
|
||||
for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
|
||||
classificationPage.getTextHeightCounter().add(word.getTextHeight());
|
||||
classificationPage.getFontCounter().add(word.getFont());
|
||||
classificationPage.getFontSizeCounter().add(word.getFontSize());
|
||||
classificationPage.getFontStyleCounter().add(word.getFontStyle());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -7,14 +7,11 @@ import java.util.regex.Pattern;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -25,7 +22,6 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class DocuMineClassificationService {
|
||||
|
||||
private final BodyTextFrameService bodyTextFrameService;
|
||||
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||
@ -33,14 +29,11 @@ public class DocuMineClassificationService {
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false, LayoutParsingType.DOCUMINE);
|
||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true, LayoutParsingType.DOCUMINE);
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
classifyPage(page, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
|
||||
@ -5,14 +5,11 @@ import java.util.regex.Pattern;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -23,19 +20,14 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class RedactManagerClassificationService {
|
||||
|
||||
private final BodyTextFrameService bodyTextFrameService;
|
||||
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false, LayoutParsingType.REDACT_MANAGER);
|
||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true, LayoutParsingType.REDACT_MANAGER);
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
classifyPage(page, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
|
||||
@ -5,8 +5,6 @@ import java.util.regex.Pattern;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
@ -28,14 +26,13 @@ public class TaasClassificationService {
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false, LayoutParsingType.TAAS);
|
||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true, LayoutParsingType.TAAS);
|
||||
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
|
||||
classifyPage(page, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
|
||||
@ -152,7 +152,7 @@ public class PdfVisualisationUtility {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List<Line2D> line2DS, Options options) {
|
||||
public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List<? extends Line2D> line2DS, Options options) {
|
||||
|
||||
var pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
|
||||
@ -22,10 +22,11 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
@ -46,7 +47,7 @@ public class BdrJsonBuildTest extends AbstractTest {
|
||||
|
||||
try (InputStream inputStream = new FileInputStream(filename)) {
|
||||
try (PDDocument pdDocument = Loader.loadPDF(inputStream)) {
|
||||
return layoutParsingPipeline.parseLayoutWithTimer(LayoutParsingType.TAAS, pdDocument, new ImageServiceResponse(), new TableServiceResponse());
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS, pdDocument, new ImageServiceResponse(), new TableServiceResponse()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -34,6 +34,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -94,10 +95,10 @@ public class HeadlinesGoldStandardIntegrationTest {
|
||||
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
|
||||
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
|
||||
|
||||
Document documentGraph = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(pdfFileResource.getInputStream()),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse());
|
||||
new TableServiceResponse()));
|
||||
|
||||
var foundHeadlines = documentGraph.streamAllSubNodes()
|
||||
.map(SemanticNode::getHeadline)
|
||||
|
||||
@ -15,6 +15,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
@ -29,7 +30,7 @@ public class BuildDocumentGraphTest extends AbstractTest {
|
||||
@Disabled
|
||||
public void buildMetolachlor() {
|
||||
|
||||
Document documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06");
|
||||
Document documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
assertEquals(221, documentGraph.getPages().size());
|
||||
assertEquals(220, documentGraph.getPages().stream().filter(page -> page.getHeader().hasText()).count());
|
||||
assertEquals(0, documentGraph.getPages().stream().filter(page -> page.getFooter().hasText()).count());
|
||||
@ -47,10 +48,10 @@ public class BuildDocumentGraphTest extends AbstractTest {
|
||||
ClassPathResource fileResource = new ClassPathResource(filename);
|
||||
|
||||
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) {
|
||||
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||
pdDocument,
|
||||
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
|
||||
new TableServiceResponse());
|
||||
new TableServiceResponse()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
@ -54,10 +55,10 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest {
|
||||
@SneakyThrows
|
||||
private void writeJsons(Path filename) {
|
||||
|
||||
Document documentGraph = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(new FileInputStream(filename.toFile())),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse());
|
||||
new TableServiceResponse()));
|
||||
|
||||
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
|
||||
ObjectMapper mapper = ObjectMapperFactory.create();
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
import java.awt.Color;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
@ -23,7 +24,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
|
||||
@Disabled
|
||||
public void visualizeMetolachlor() {
|
||||
|
||||
String filename = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06";
|
||||
String filename = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
visualizePdf(filename);
|
||||
}
|
||||
|
||||
@ -33,7 +34,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
|
||||
@Disabled
|
||||
public void visualizeRotatedTestDocument() {
|
||||
|
||||
String filename = "files/211";
|
||||
String filename = "files/new/RotateTestFile.pdf";
|
||||
visualizePdf(filename);
|
||||
}
|
||||
|
||||
@ -43,7 +44,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
|
||||
@Disabled
|
||||
public void visualizeCraftedDocument() {
|
||||
|
||||
String filename = "files/crafted document";
|
||||
String filename = "files/crafted document.pdf";
|
||||
visualizePdf(filename);
|
||||
}
|
||||
|
||||
@ -60,8 +61,8 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
|
||||
|
||||
private static void visualizeSemanticNodes(String filename, Document documentGraph, TextBlock textBlock) throws IOException {
|
||||
|
||||
File tmpFile = File.createTempFile(filename, "SEMANTIC_NODES_BBOX.pdf");
|
||||
ClassPathResource fileResource = new ClassPathResource(filename + ".pdf");
|
||||
File tmpFile = new File("/tmp/" + Path.of(filename).getFileName().toString() + "_SEMANTIC_NODES_BBOX.pdf");
|
||||
ClassPathResource fileResource = new ClassPathResource(filename);
|
||||
|
||||
try (var fileStream = fileResource.getInputStream();//
|
||||
PDDocument pdDocument = Loader.loadPDF(fileStream)//
|
||||
|
||||
@ -21,16 +21,16 @@ import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PdfParsingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||
@ -40,7 +40,7 @@ import lombok.SneakyThrows;
|
||||
public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
@Autowired
|
||||
private PdfParsingService pdfParsingService;
|
||||
private LayoutParsingPipeline layoutParsingPipeline;
|
||||
|
||||
@Autowired
|
||||
private ObjectMapper objectMapper;
|
||||
@ -57,12 +57,13 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Autowired
|
||||
private SectionsBuilderService sectionsBuilderService;
|
||||
|
||||
|
||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
||||
|
||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(LayoutParsingType.REDACT_MANAGER,
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
originDocument,
|
||||
cvTableParsingAdapter.buildCvParsedTablesPerPage(new TableServiceResponse()),
|
||||
imageServiceResponseAdapter.buildClassifiedImagesPerPage(new ImageServiceResponse()));
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse());
|
||||
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.DividingCol
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
@ -30,7 +30,7 @@ class GapAcrossLinesDetectionServiceTest {
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
|
||||
List<PageInformation> pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList();
|
||||
List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
|
||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start column detection");
|
||||
@ -56,7 +56,7 @@ class GapAcrossLinesDetectionServiceTest {
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
List<PageContents> sortedTextPositionSequencesPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename);
|
||||
List<PageContents> sortedTextPositionSequencesPerPage = PageContentExtractor.getSortedPageContents(filename);
|
||||
List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
|
||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start column detection");
|
||||
|
||||
@ -13,7 +13,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.PageInformatio
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
@ -28,7 +28,7 @@ class InvisibleTableDetectionServiceTest {
|
||||
|
||||
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
|
||||
List<PageInformation> pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
|
||||
List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
|
||||
|
||||
int pageNumber = 1;
|
||||
Rectangle2D tableBBox = pageContents.get(0)
|
||||
|
||||
@ -7,7 +7,7 @@ import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -20,7 +20,7 @@ class MainBodyTextFrameExtractionServiceTest {
|
||||
|
||||
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||
String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString();
|
||||
List<PageContents> sortedTextPositionSequence = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
|
||||
List<PageContents> sortedTextPositionSequence = PageContentExtractor.getSortedPageContents(fileName);
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -9,12 +9,12 @@ import org.junit.jupiter.api.Test;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class TextPositionSequenceSorterTest {
|
||||
class PageContentExtractorTest {
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@ -24,7 +24,7 @@ class TextPositionSequenceSorterTest {
|
||||
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();
|
||||
|
||||
List<PageContents> textPositionPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
|
||||
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
|
||||
|
||||
PdfDraw.drawRectanglesPerPageNumberedByLine(fileName,
|
||||
textPositionPerPage.stream()
|
||||
@ -8,7 +8,7 @@ import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
@ -24,7 +24,7 @@ class PageInformationServiceTest {
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
|
||||
List<PageInformation> pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList();
|
||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start gap detection");
|
||||
start = System.currentTimeMillis();
|
||||
@ -47,7 +47,7 @@ class PageInformationServiceTest {
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
|
||||
List<PageInformation> pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList();
|
||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start gap detection");
|
||||
start = System.currentTimeMillis();
|
||||
|
||||
@ -0,0 +1,38 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class RulingCleaningServiceTest {
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
public void textRulingExtraction() {
|
||||
|
||||
String fileName = "files/BASF/2013-1110704.pdf";
|
||||
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents("files/BASF/2013-1110704.pdf");
|
||||
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
|
||||
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
|
||||
for (PageContents pageContent : pageContents) {
|
||||
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 20));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -24,6 +24,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
@ -206,6 +207,21 @@ public class PdfDraw {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawLinesPerPage(String filename, List<List<Ruling>> linesPerPage, String tmpFileName) {
|
||||
|
||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream();//
|
||||
PDDocument pdDocument = Loader.loadPDF(inputStream);//
|
||||
var out = new FileOutputStream(tmpFileName)//
|
||||
) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
PdfVisualisationUtility.drawLine2DList(pdDocument, pageNumber, linesPerPage.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build());
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@Getter
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user