Refactoring to make downstream refactoring easier

This commit is contained in:
Kilian Schuettler 2023-08-04 15:16:36 +02:00
parent d9a3bbbd30
commit 4a5464d6aa
25 changed files with 315 additions and 267 deletions

View File

@ -3,32 +3,50 @@ package com.knecon.fforesight.service.layoutparser.processor;
import static java.lang.String.format;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.services.PdfParsingService;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionGridCreatorService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@ -39,14 +57,18 @@ public class LayoutParsingPipeline {
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
private final CvTableParsingAdapter cvTableParsingAdapter;
private final LayoutParsingStorageService layoutParsingStorageService;
private final PdfParsingService pdfParsingService;
private final SectionsBuilderService sectionsBuilderService;
private final SectionGridCreatorService sectionGridCreatorService;
private final TaasClassificationService taasClassificationService;
private final RedactManagerClassificationService redactManagerClassificationService;
private final DocuMineClassificationService docuMineClassificationService;
private final SimplifiedSectionTextService simplifiedSectionTextService;
private final BodyTextFrameService bodyTextFrameService;
private final RulingCleaningService rulingCleaningService;
private final TableExtractionService tableExtractionService;
private final TaasBlockificationService taasBlockificationService;
private final DocuMineBlockificationService docuMineBlockificationService;
private final RedactManagerBlockificationService redactManagerBlockificationService;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -63,7 +85,9 @@ public class LayoutParsingPipeline {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
}
Document documentGraph = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
int numberOfPages = originDocument.getNumberOfPages();
layoutParsingStorageService.storeSectionGrid(layoutParsingRequest, sectionGridCreatorService.createSectionGrid(documentGraph));
@ -88,15 +112,72 @@ public class LayoutParsingPipeline {
}
public Document parseLayout(LayoutParsingType layoutParsingType,
@SneakyThrows
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
PDDocument originDocument,
ImageServiceResponse imageServiceResponse,
TableServiceResponse tableServiceResponse) {
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType,
originDocument,
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
ClassificationDocument classificationDocument = new ClassificationDocument();
List<ClassificationPage> classificationPages = new ArrayList<>();
originDocument.setAllSecurityToBeRemoved(true);
long pageCount = originDocument.getNumberOfPages();
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
classificationDocument.setPages(classificationPages);
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
PDPage pdPage = originDocument.getPage(pageNumber - 1);
stripper.setPageNumber(pageNumber);
stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage);
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE)) {
stripper.setSortByPosition(true);
}
stripper.getText(originDocument);
PDRectangle pdr = pdPage.getMediaBox();
int rotation = pdPage.getRotation();
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
stripper.getRulings(),
stripper.getMinCharWidth(),
stripper.getMaxCharHeight());
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
};
classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation);
classificationPage.setLandscape(isLandscape);
classificationPage.setPageNumber(pageNumber);
classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight());
// Whether images are OCR must be determined before text blocks are moved into tables; otherwise the findOcr algorithm needs to be adapted.
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
classificationPage.setImages(pdfImages.get(pageNumber));
imageServiceResponseAdapter.findOcr(classificationPage);
}
tableExtractionService.extractTables(cleanRulings, classificationPage, layoutParsingType);
buildPageStatistics(classificationPage);
increaseDocumentStatistics(classificationPage, classificationDocument);
classificationPages.add(classificationPage);
}
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
switch (layoutParsingType) {
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
@ -107,40 +188,39 @@ public class LayoutParsingPipeline {
sectionsBuilderService.buildSections(classificationDocument);
sectionsBuilderService.addImagesToSections(classificationDocument);
return DocumentGraphFactory.buildDocumentGraph(classificationDocument);
return classificationDocument;
}
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
public Document parseLayoutWithTimer(LayoutParsingType layoutParsingType,
PDDocument originDocument,
ImageServiceResponse imageServiceResponse,
TableServiceResponse tableServiceResponse) {
long start = System.currentTimeMillis();
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType,
originDocument,
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
System.out.printf("parsed %d ms", System.currentTimeMillis() - start);
start = System.currentTimeMillis();
switch (layoutParsingType) {
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
if (!classificationPage.isLandscape()) {
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
}
System.out.printf(", classified %d ms", System.currentTimeMillis() - start);
start = System.currentTimeMillis();
sectionsBuilderService.buildSections(classificationDocument);
System.out.printf(", sections built %d ms", System.currentTimeMillis() - start);
start = System.currentTimeMillis();
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
System.out.printf(", graph constructed %d ms", System.currentTimeMillis() - start);
return document;
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
}
/**
 * Fills the per-page frequency counters (text height, font, font size, font style)
 * from every text position sequence found in the page's text blocks.
 *
 * Only blocks that are {@link TextPageBlock}s with a non-null sequence list contribute.
 * Blocks inside tables are intentionally not part of the statistics, as tables will
 * always be added to the BodyTextFrame.
 *
 * @param classificationPage page whose counters are updated in place
 */
private void buildPageStatistics(ClassificationPage classificationPage) {

    for (AbstractPageBlock block : classificationPage.getTextBlocks()) {
        if (!(block instanceof TextPageBlock)) {
            continue;
        }

        TextPageBlock textPageBlock = (TextPageBlock) block;
        if (textPageBlock.getSequences() == null) {
            continue;
        }

        for (TextPositionSequence sequence : textPageBlock.getSequences()) {
            classificationPage.getTextHeightCounter().add(sequence.getTextHeight());
            classificationPage.getFontCounter().add(sequence.getFont());
            classificationPage.getFontSizeCounter().add(sequence.getFontSize());
            classificationPage.getFontStyleCounter().add(sequence.getFontStyle());
        }
    }
}
}

View File

@ -5,6 +5,7 @@ import java.util.List;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import lombok.Data;
@ -35,4 +36,6 @@ public class ClassificationPage {
private float pageWidth;
private float pageHeight;
CleanRulings cleanRulings;
}

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.AllArgsConstructor;
@ -17,5 +18,5 @@ public class PageContents {
List<TextPositionSequence> sortedTextPositionSequences;
Rectangle2D cropBox;
Rectangle2D mediaBox;
List<Ruling> rulings;
}

View File

@ -28,6 +28,7 @@ public class Ruling extends Line2D.Float {
super(p1, p2);
}
public Ruling straightenVertical() {
double y1 = Math.min(getY1(), getY2());
@ -36,6 +37,7 @@ public class Ruling extends Line2D.Float {
return new Ruling(new Point2D.Double(x, y1), new Point2D.Double(x, y2));
}
public Ruling straightenHorizontal() {
double x1 = Math.min(getX1(), getX2());
@ -444,6 +446,16 @@ public class Ruling extends Line2D.Float {
}
/**
 * Tells whether the given ruling has nearly the same endpoints as this ruling.
 *
 * The endpoints are compared pairwise and in order (start with start, end with end),
 * so a ruling with swapped endpoints does not match. Each coordinate has to differ
 * by strictly less than the tolerance of 1.
 *
 * @param ruling the ruling to compare against
 * @return true if all four coordinates are within the tolerance, false otherwise
 */
public boolean almostMatches(Ruling ruling) {

    final float tolerance = 1;

    boolean startPointMatches = Math.abs(ruling.getX1() - x1) < tolerance && Math.abs(ruling.getY1() - y1) < tolerance;
    if (!startPointMatches) {
        return false;
    }

    return Math.abs(ruling.getX2() - x2) < tolerance && Math.abs(ruling.getY2() - y2) < tolerance;
}
private enum SOType {
VERTICAL,
HRIGHT,

View File

@ -8,6 +8,7 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlo
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
@ -20,6 +21,65 @@ public class BodyTextFrameService {
/**
 * Calculates the portrait and the landscape body text frame for the whole document
 * and applies the page-adjusted frame to every page.
 *
 * Both frames are calculated from all pages and the document-wide font size counter
 * before any page is touched.
 *
 * @param classificationDocument document whose pages receive their body text frame
 * @param layoutParsingType      parsing flavour, forwarded to the frame calculation
 */
public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {

    List<ClassificationPage> pages = classificationDocument.getPages();
    FloatFrequencyCounter fontSizeCounter = classificationDocument.getFontSizeCounter();

    Rectangle portraitFrame = calculateBodyTextFrame(pages, fontSizeCounter, false, layoutParsingType);
    Rectangle landscapeFrame = calculateBodyTextFrame(pages, fontSizeCounter, true, layoutParsingType);

    pages.forEach(page -> setBodyTextFrameAdjustedToPage(page, portraitFrame, landscapeFrame));
}
/*
private Rectangle calculateBodyTextFrameByRulings(List<ClassificationPage> pages) {
Map<ClassificationPage, List<Ruling>> potentialHeaderRulingsPerPage = new HashMap<>();
Map<ClassificationPage, List<Ruling>> potentialFooterRulingsPerPage = new HashMap<>();
for (var page : pages) {
potentialHeaderRulingsPerPage.put(page,
page.getCleanRulings()
.getHorizontal()
.stream()
.filter(ruling -> ruling.getY1() > page.getPageHeight() * 0.8)
.filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth())
.toList());
potentialFooterRulingsPerPage.put(page,
page.getCleanRulings()
.getHorizontal()
.stream()
.filter(ruling -> ruling.getY1() < page.getPageHeight() * 0.2)
.filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth())
.toList());
}
Optional<Ruling> headerRuling = potentialHeaderRulingsPerPage.values()
.stream()
.flatMap(Collection::stream)
.filter(ruling -> potentialHeaderRulingsPerPage.values()
.stream()
.filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches))
.count() > pages.size() * RULING_THRESHOLD_FACTOR)
.min(Comparator.comparingDouble(Ruling::getY1));
Optional<Ruling> footerRuling = potentialFooterRulingsPerPage.values()
.stream()
.flatMap(Collection::stream)
.filter(ruling -> potentialFooterRulingsPerPage.values()
.stream()
.filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches))
.count() > pages.size() * RULING_THRESHOLD_FACTOR)
.max(Comparator.comparingDouble(Ruling::getY1));
double maxY = headerRuling.isPresent() ? headerRuling.get().y1 : pages.stream().mapToDouble(ClassificationPage::getPageHeight).max().orElse(Double.MAX_VALUE);
double minY = footerRuling.map(ruling -> ruling.y1).orElse(0F);
double maxX = pages.stream().mapToDouble(ClassificationPage::getPageWidth).max().orElse(Double.MAX_VALUE);
return new Rectangle(new Point((float) maxX, (float) maxY), (float) 0, (float) minY, -1);
}
*/
/**
* Adjusts and sets the body text frame to a page.
@ -34,7 +94,7 @@ public class BodyTextFrameService {
* @param bodyTextFrame frame that contains the main text on portrait pages
* @param landscapeBodyTextFrame frame that contains the main text on landscape pages
*/
public void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
private void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
Rectangle textFrame = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
@ -69,7 +129,10 @@ public class BodyTextFrameService {
* @param landscape Calculate for landscape or portrait
* @return Rectangle of the text frame
*/
public Rectangle calculateBodyTextFrame(List<ClassificationPage> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape, LayoutParsingType layoutParsingType) {
private Rectangle calculateBodyTextFrame(List<ClassificationPage> pages,
FloatFrequencyCounter documentFontSizeCounter,
boolean landscape,
LayoutParsingType layoutParsingType) {
float approximateHeaderLineCount;
if (layoutParsingType.equals(LayoutParsingType.TAAS)) {
@ -95,8 +158,8 @@ public class BodyTextFrameService {
}
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10)
|| !layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount){
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10) || !layoutParsingType.equals(
LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount) {
continue;
}

View File

@ -21,9 +21,9 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTrans
import lombok.experimental.UtilityClass;
@UtilityClass
public class TextPositionSequenceSorter {
public class PageContentExtractor {
public List<PageContents> getSortedTextPositionsWithPages(String filename) throws IOException {
public List<PageContents> getSortedPageContents(String filename) throws IOException {
List<PageContents> textPositionSequencesPerPage = new LinkedList<>();
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
@ -49,7 +49,8 @@ public class TextPositionSequenceSorter {
textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
RectangleTransformations.toRectangle2D(pdPage.getMediaBox())));
RectangleTransformations.toRectangle2D(pdPage.getMediaBox()),
stripper.getRulings()));
}
}
}

View File

@ -1,154 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Parses a PDF page by page into a {@link ClassificationDocument}.
 *
 * For every page the text stripper is run, the rulings are cleaned, the text is
 * blockified according to the layout parsing type, images are attached, tables are
 * extracted and the font statistics of the page are added to the document.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class PdfParsingService {
private final RulingCleaningService rulingCleaningService;
private final TableExtractionService tableExtractionService;
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
private final TaasBlockificationService taasBlockificationService;
private final DocuMineBlockificationService docuMineBlockificationService;
private final RedactManagerBlockificationService redactManagerBlockificationService;
/**
 * Parses all pages of the given PDF into a new classification document.
 *
 * Note: calls {@code setAllSecurityToBeRemoved(true)} on the passed-in document,
 * i.e. the argument is modified.
 *
 * @param layoutParsingType selects the blockification service and the stripper sorting
 * @param originDocument    the PDF to parse
 * @param pdfTableCells     table cells keyed by 1-based page number; must not be null,
 *                          it is read for every page without a null check
 * @param pdfImages         classified images keyed by 1-based page number; may be null
 * @return the classification document containing one classification page per PDF page
 */
public ClassificationDocument parseDocument(LayoutParsingType layoutParsingType,
PDDocument originDocument,
Map<Integer, List<TableCells>> pdfTableCells,
Map<Integer, List<ClassifiedImage>> pdfImages) {
ClassificationDocument document = new ClassificationDocument();
List<ClassificationPage> classificationPages = new ArrayList<>();
originDocument.setAllSecurityToBeRemoved(true);
long pageCount = originDocument.getNumberOfPages();
// Page numbers are 1-based here; parsePage converts to the 0-based PDPage index.
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
parsePage(layoutParsingType, pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber);
}
document.setPages(classificationPages);
return document;
}
/**
 * Parses a single page, appends the resulting classification page to
 * {@code classificationPages} and adds its statistics to {@code document}.
 *
 * The statement order matters: the stripper has to be configured before
 * {@code getText} runs, its rulings and text positions are read afterwards, and
 * OCR detection has to happen before table extraction.
 *
 * Checked exceptions raised in here are rethrown undeclared via Lombok's
 * {@code @SneakyThrows} (presumably IOException from the stripper - TODO confirm).
 *
 * @param layoutParsingType   selects the blockification service and the stripper sorting
 * @param pdfImages           classified images keyed by 1-based page number; may be null
 * @param pdDocument          the PDF the page belongs to
 * @param pdfTableCells       table cells keyed by 1-based page number
 * @param document            document whose statistics counters are increased
 * @param classificationPages list the parsed page is appended to
 * @param pageNumber          1-based number of the page to parse
 */
@SneakyThrows
private void parsePage(LayoutParsingType layoutParsingType,
Map<Integer, List<ClassifiedImage>> pdfImages,
PDDocument pdDocument,
Map<Integer, List<TableCells>> pdfTableCells,
ClassificationDocument document,
List<ClassificationPage> classificationPages,
int pageNumber) {
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
// Restrict the stripper to exactly this page.
stripper.setPageNumber(pageNumber);
stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage);
// Only DOCUMINE requests position-sorted text.
if(layoutParsingType.equals(LayoutParsingType.DOCUMINE)){
stripper.setSortByPosition(true);
}
// The returned text is discarded; the stripper is queried below for rulings, character metrics and text position sequences.
stripper.getText(pdDocument);
PDRectangle pdr = pdPage.getMediaBox();
int rotation = pdPage.getRotation();
// Landscape: media box wider than high at rotation 0/180, or higher than wide at rotation 90/270.
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
// Page width and height are taken from the crop box, while the landscape check above uses the media box.
PDRectangle cropbox = pdPage.getCropBox();
// pdfTableCells.get(pageNumber) yields null for pages without an entry - NOTE(review): confirm getCleanRulings accepts null.
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
stripper.getRulings(),
stripper.getMinCharWidth(),
stripper.getMaxCharHeight());
// Each layout parsing type has its own blockification service; all receive the same inputs.
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
};
classificationPage.setRotation(rotation);
classificationPage.setLandscape(isLandscape);
classificationPage.setPageNumber(pageNumber);
classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight());
// Whether images are OCR must be determined before text blocks are moved into tables; otherwise the findOcr algorithm needs to be adapted.
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
classificationPage.setImages(pdfImages.get(pageNumber));
imageServiceResponseAdapter.findOcr(classificationPage);
}
tableExtractionService.extractTables(cleanRulings, classificationPage, layoutParsingType);
// Page statistics are built after table extraction and then added to the document-wide counters.
buildPageStatistics(classificationPage);
increaseDocumentStatistics(classificationPage, document);
classificationPages.add(classificationPage);
}
/**
 * Adds the counters of a page to the document-wide counters.
 *
 * Font sizes are only added for pages that are not landscape; font, text height
 * and font style counts are added for every page.
 *
 * @param classificationPage page whose counters are read
 * @param document           document whose counters are increased
 */
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
if (!classificationPage.isLandscape()) {
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
}
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
}
/**
 * Fills the text height, font, font size and font style counters of the page from
 * the text position sequences of its text blocks.
 *
 * @param classificationPage page whose counters are updated in place
 */
private void buildPageStatistics(ClassificationPage classificationPage) {
// Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
// Blocks without sequences contribute nothing.
if (((TextPageBlock) textBlock).getSequences() == null) {
continue;
}
for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
classificationPage.getTextHeightCounter().add(word.getTextHeight());
classificationPage.getFontCounter().add(word.getFont());
classificationPage.getFontSizeCounter().add(word.getFontSize());
classificationPage.getFontStyleCounter().add(word.getFontStyle());
}
}
}
}
}

View File

@ -7,14 +7,11 @@ import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
@ -25,7 +22,6 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class DocuMineClassificationService {
private final BodyTextFrameService bodyTextFrameService;
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
@ -33,14 +29,11 @@ public class DocuMineClassificationService {
public void classifyDocument(ClassificationDocument document) {
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false, LayoutParsingType.DOCUMINE);
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true, LayoutParsingType.DOCUMINE);
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
for (ClassificationPage page : document.getPages()) {
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
classifyPage(page, document, headlineFontSizes);
}
}

View File

@ -5,14 +5,11 @@ import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
@ -23,19 +20,14 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class RedactManagerClassificationService {
private final BodyTextFrameService bodyTextFrameService;
public void classifyDocument(ClassificationDocument document) {
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false, LayoutParsingType.REDACT_MANAGER);
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true, LayoutParsingType.REDACT_MANAGER);
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
for (ClassificationPage page : document.getPages()) {
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
classifyPage(page, document, headlineFontSizes);
}
}

View File

@ -5,8 +5,6 @@ import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
@ -28,14 +26,13 @@ public class TaasClassificationService {
/**
 * Classifies all pages of the given document using the {@code TAAS} layout parsing type.
 * <p>
 * Two body text frames are calculated once for the whole document (the boolean flag passed to
 * {@code calculateBodyTextFrame} is {@code false} for the regular frame and {@code true} for the
 * landscape frame). Each page then gets its body text frame set and is classified with the
 * document-wide headline font sizes.
 *
 * @param document the document whose pages are classified
 */
public void classifyDocument(ClassificationDocument document) {

    var fontSizeCounter = document.getFontSizeCounter();
    var pages = document.getPages();

    Rectangle portraitFrame = bodyTextFrameService.calculateBodyTextFrame(pages, fontSizeCounter, false, LayoutParsingType.TAAS);
    Rectangle landscapeFrame = bodyTextFrameService.calculateBodyTextFrame(pages, fontSizeCounter, true, LayoutParsingType.TAAS);

    // Font sizes above the most frequently used one are handed to the page classification as headline candidates.
    List<Float> headlineSizes = fontSizeCounter.getHighterThanMostPopular();

    log.debug("Document FontSize counters are: {}", fontSizeCounter.getCountPerValue());

    for (ClassificationPage currentPage : pages) {
        bodyTextFrameService.setBodyTextFrameAdjustedToPage(currentPage, portraitFrame, landscapeFrame);
        classifyPage(currentPage, document, headlineSizes);
    }
}

View File

@ -152,7 +152,7 @@ public class PdfVisualisationUtility {
@SneakyThrows
public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List<Line2D> line2DS, Options options) {
public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List<? extends Line2D> line2DS, Options options) {
var pdPage = pdDocument.getPage(pageNumber - 1);
var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true);

View File

@ -22,10 +22,11 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
@ -46,7 +47,7 @@ public class BdrJsonBuildTest extends AbstractTest {
try (InputStream inputStream = new FileInputStream(filename)) {
try (PDDocument pdDocument = Loader.loadPDF(inputStream)) {
return layoutParsingPipeline.parseLayoutWithTimer(LayoutParsingType.TAAS, pdDocument, new ImageServiceResponse(), new TableServiceResponse());
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS, pdDocument, new ImageServiceResponse(), new TableServiceResponse()));
}
}
}

View File

@ -34,6 +34,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.tenantcommons.TenantsClient;
import lombok.AllArgsConstructor;
@ -94,10 +95,10 @@ public class HeadlinesGoldStandardIntegrationTest {
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
Document documentGraph = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Loader.loadPDF(pdfFileResource.getInputStream()),
new ImageServiceResponse(),
new TableServiceResponse());
new TableServiceResponse()));
var foundHeadlines = documentGraph.streamAllSubNodes()
.map(SemanticNode::getHeadline)

View File

@ -15,6 +15,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
import lombok.SneakyThrows;
@ -29,7 +30,7 @@ public class BuildDocumentGraphTest extends AbstractTest {
@Disabled
public void buildMetolachlor() {
Document documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06");
Document documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
assertEquals(221, documentGraph.getPages().size());
assertEquals(220, documentGraph.getPages().stream().filter(page -> page.getHeader().hasText()).count());
assertEquals(0, documentGraph.getPages().stream().filter(page -> page.getFooter().hasText()).count());
@ -47,10 +48,10 @@ public class BuildDocumentGraphTest extends AbstractTest {
ClassPathResource fileResource = new ClassPathResource(filename);
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) {
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
pdDocument,
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
new TableServiceResponse());
new TableServiceResponse()));
}
}

View File

@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import lombok.SneakyThrows;
@ -54,10 +55,10 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest {
@SneakyThrows
private void writeJsons(Path filename) {
Document documentGraph = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Loader.loadPDF(new FileInputStream(filename.toFile())),
new ImageServiceResponse(),
new TableServiceResponse());
new TableServiceResponse()));
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
ObjectMapper mapper = ObjectMapperFactory.create();

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.server.graph;
import java.awt.Color;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
@ -23,7 +24,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
@Disabled
public void visualizeMetolachlor() {
String filename = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06";
String filename = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
visualizePdf(filename);
}
@ -33,7 +34,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
@Disabled
public void visualizeRotatedTestDocument() {
String filename = "files/211";
String filename = "files/new/RotateTestFile.pdf";
visualizePdf(filename);
}
@ -43,7 +44,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
@Disabled
public void visualizeCraftedDocument() {
String filename = "files/crafted document";
String filename = "files/crafted document.pdf";
visualizePdf(filename);
}
@ -60,8 +61,8 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
private static void visualizeSemanticNodes(String filename, Document documentGraph, TextBlock textBlock) throws IOException {
File tmpFile = File.createTempFile(filename, "SEMANTIC_NODES_BBOX.pdf");
ClassPathResource fileResource = new ClassPathResource(filename + ".pdf");
File tmpFile = new File("/tmp/" + Path.of(filename).getFileName().toString() + "_SEMANTIC_NODES_BBOX.pdf");
ClassPathResource fileResource = new ClassPathResource(filename);
try (var fileStream = fileResource.getInputStream();//
PDDocument pdDocument = Loader.loadPDF(fileStream)//

View File

@ -21,16 +21,16 @@ import org.springframework.core.io.ClassPathResource;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.services.PdfParsingService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
@ -40,7 +40,7 @@ import lombok.SneakyThrows;
public class PdfSegmentationServiceTest extends AbstractTest {
@Autowired
private PdfParsingService pdfParsingService;
private LayoutParsingPipeline layoutParsingPipeline;
@Autowired
private ObjectMapper objectMapper;
@ -57,12 +57,13 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@Autowired
private SectionsBuilderService sectionsBuilderService;
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(LayoutParsingType.REDACT_MANAGER,
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
originDocument,
cvTableParsingAdapter.buildCvParsedTablesPerPage(new TableServiceResponse()),
imageServiceResponseAdapter.buildClassifiedImagesPerPage(new ImageServiceResponse()));
new ImageServiceResponse(),
new TableServiceResponse());
redactManagerClassificationService.classifyDocument(classificationDocument);

View File

@ -14,7 +14,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.DividingCol
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService;
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows;
@ -30,7 +30,7 @@ class GapAcrossLinesDetectionServiceTest {
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis();
List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
List<PageInformation> pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList();
List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
System.out.println("start column detection");
@ -56,7 +56,7 @@ class GapAcrossLinesDetectionServiceTest {
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis();
List<PageContents> sortedTextPositionSequencesPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename);
List<PageContents> sortedTextPositionSequencesPerPage = PageContentExtractor.getSortedPageContents(filename);
List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
System.out.println("start column detection");

View File

@ -13,7 +13,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.PageInformatio
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
@ -28,7 +28,7 @@ class InvisibleTableDetectionServiceTest {
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
List<PageInformation> pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
int pageNumber = 1;
Rectangle2D tableBBox = pageContents.get(0)

View File

@ -7,7 +7,7 @@ import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import lombok.SneakyThrows;
@ -20,7 +20,7 @@ class MainBodyTextFrameExtractionServiceTest {
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString();
List<PageContents> sortedTextPositionSequence = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
List<PageContents> sortedTextPositionSequence = PageContentExtractor.getSortedPageContents(fileName);
}

View File

@ -9,12 +9,12 @@ import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows;
class TextPositionSequenceSorterTest {
class PageContentExtractorTest {
@Test
@Disabled
@ -24,7 +24,7 @@ class TextPositionSequenceSorterTest {
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();
List<PageContents> textPositionPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
PdfDraw.drawRectanglesPerPageNumberedByLine(fileName,
textPositionPerPage.stream()

View File

@ -8,7 +8,7 @@ import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows;
@ -24,7 +24,7 @@ class PageInformationServiceTest {
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis();
List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
List<PageInformation> pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList();
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
System.out.println("start gap detection");
start = System.currentTimeMillis();
@ -47,7 +47,7 @@ class PageInformationServiceTest {
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis();
List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
List<PageInformation> pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList();
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
System.out.println("start gap detection");
start = System.currentTimeMillis();

View File

@ -0,0 +1,38 @@
package com.knecon.fforesight.service.layoutparser.server.services;
import java.nio.file.Path;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows;
/**
 * Manual (disabled) visualisation test for ruling extraction and cleaning.
 * <p>
 * It draws the rulings extracted from a classpath PDF into a copy of that PDF under {@code /tmp/}
 * and runs {@link RulingCleaningService#getCleanRulings} on every page. There are no assertions;
 * the result is meant to be inspected by hand.
 */
public class RulingCleaningServiceTest {

    /**
     * Extracts the page contents of the test PDF, draws the raw rulings of each page into
     * {@code /tmp/<pdf name>_LINES.pdf} and then cleans the rulings page by page.
     */
    @Test
    @Disabled
    @SneakyThrows
    public void textRulingExtraction() {

        String fileName = "files/BASF/2013-1110704.pdf";
        String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";

        // Parse the same file that is used as drawing background; reusing fileName keeps both in sync.
        List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
        PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);

        RulingCleaningService rulingCleaningService = new RulingCleaningService();
        // NOTE(review): the cleaned rulings are collected but neither drawn nor asserted yet.
        List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
        for (PageContents pageContent : pageContents) {
            // No table cells are passed in (empty list); the meaning of 8 and 20 is not visible here - TODO confirm against RulingCleaningService.
            cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 20));
        }
    }

}

View File

@ -24,6 +24,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
@ -206,6 +207,21 @@ public class PdfDraw {
}
/**
 * Draws the given lines in red onto the pages of a classpath PDF and writes the result to a file.
 * <p>
 * {@code linesPerPage} is indexed by page: the entry at index {@code 0} is drawn on page 1 and so on.
 * If the list has fewer entries than the document has pages, only the pages with an entry are
 * drawn on; the remaining pages are saved unchanged.
 *
 * @param filename     classpath location of the PDF to draw on
 * @param linesPerPage the lines to draw, one list per page in page order
 * @param tmpFileName  path of the output file the annotated PDF is saved to
 */
@SneakyThrows
public static void drawLinesPerPage(String filename, List<List<Ruling>> linesPerPage, String tmpFileName) {

    // The drawing options are the same for every page, so they are built once up front.
    PdfVisualisationUtility.Options options = PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build();

    try (InputStream inputStream = new ClassPathResource(filename).getInputStream();//
            PDDocument pdDocument = Loader.loadPDF(inputStream);//
            var out = new FileOutputStream(tmpFileName)//
    ) {
        // Never index past the end of linesPerPage when it holds fewer entries than the PDF has pages.
        int drawablePages = Math.min(pdDocument.getNumberOfPages(), linesPerPage.size());
        for (int pageNumber = 1; pageNumber <= drawablePages; pageNumber++) {
            PdfVisualisationUtility.drawLine2DList(pdDocument, pageNumber, linesPerPage.get(pageNumber - 1), options);
        }
        pdDocument.save(out);
    }
}
@Builder
@AllArgsConstructor
@Getter