Merge branch 'PDFBox-update' into 'main'
Upgrade PDFBox to 3.0.0. See merge request fforesight/layout-parser!52
This commit is contained in:
commit
b251697492
@ -6,7 +6,7 @@ plugins {
|
||||
description = "layoutparser-service-processor"
|
||||
|
||||
val jacksonVersion = "2.15.2"
|
||||
val pdfBoxVersion = "3.0.0-RC1"
|
||||
val pdfBoxVersion = "3.0.0"
|
||||
|
||||
dependencies {
|
||||
implementation(project(":layoutparser-service-internal-api"))
|
||||
|
||||
@ -103,7 +103,7 @@ public class LayoutParsingPipeline {
|
||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
||||
|
||||
try (var out = new ByteArrayOutputStream()) {
|
||||
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out);
|
||||
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out, false);
|
||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out);
|
||||
}
|
||||
|
||||
|
||||
@ -13,7 +13,6 @@ import java.nio.file.StandardOpenOption;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@ -49,7 +48,7 @@ public class LayoutParsingStorageService {
|
||||
IOUtils.copy(originDocumentInputStream, tempFileOutputStream);
|
||||
originDocumentInputStream.close();
|
||||
}
|
||||
return Loader.loadPDF(tempFile, MemoryUsageSetting.setupMixed(67108864L));
|
||||
return Loader.loadPDF(tempFile);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
@ -13,72 +13,70 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
@Service
|
||||
public class BodyTextFrameService {
|
||||
|
||||
private static final float RULING_HEIGHT_THRESHOLD = 0.15f; // multiplied with page height. Header/Footer Rulings must be within that border of the page.
|
||||
private static final float RULING_WIDTH_THRESHOLD = 0.75f; // multiplied with page width. Header/Footer Rulings must be at least that wide.
|
||||
|
||||
public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {
|
||||
|
||||
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
|
||||
Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType);
|
||||
for (ClassificationPage page : classificationDocument.getPages()) {
|
||||
// var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
private Rectangle calculateBodyTextFrameByRulings(List<ClassificationPage> pages) {
|
||||
private Rectangle getBodyTextFrameFromRulings(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
|
||||
|
||||
Map<ClassificationPage, List<Ruling>> potentialHeaderRulingsPerPage = new HashMap<>();
|
||||
Map<ClassificationPage, List<Ruling>> potentialFooterRulingsPerPage = new HashMap<>();
|
||||
|
||||
for (var page : pages) {
|
||||
potentialHeaderRulingsPerPage.put(page,
|
||||
page.getCleanRulings()
|
||||
.getHorizontal()
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getY1() > page.getPageHeight() * 0.8)
|
||||
.filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth())
|
||||
.toList());
|
||||
potentialFooterRulingsPerPage.put(page,
|
||||
page.getCleanRulings()
|
||||
.getHorizontal()
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getY1() < page.getPageHeight() * 0.2)
|
||||
.filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth())
|
||||
.toList());
|
||||
List<Ruling> potentialFooterRulings = getPotentialFooterRulings(page);
|
||||
List<Ruling> potentialHeaderRulings = getPotentialHeaderRulings(page);
|
||||
var x = bodyTextFrame.getTopLeft().getX();
|
||||
var y = bodyTextFrame.getTopLeft().getY();
|
||||
var w = bodyTextFrame.getWidth();
|
||||
var h = bodyTextFrame.getHeight();
|
||||
if (!potentialFooterRulings.isEmpty()) {
|
||||
h = y + h - potentialFooterRulings.get(0).getTop();
|
||||
y = potentialFooterRulings.get(0).getTop();
|
||||
}
|
||||
|
||||
Optional<Ruling> headerRuling = potentialHeaderRulingsPerPage.values()
|
||||
.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.filter(ruling -> potentialHeaderRulingsPerPage.values()
|
||||
.stream()
|
||||
.filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches))
|
||||
.count() > pages.size() * RULING_THRESHOLD_FACTOR)
|
||||
.min(Comparator.comparingDouble(Ruling::getY1));
|
||||
|
||||
Optional<Ruling> footerRuling = potentialFooterRulingsPerPage.values()
|
||||
.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.filter(ruling -> potentialHeaderRulingsPerPage.values()
|
||||
.stream()
|
||||
.filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches))
|
||||
.count() > pages.size() * RULING_THRESHOLD_FACTOR)
|
||||
.max(Comparator.comparingDouble(Ruling::getY1));
|
||||
|
||||
double maxY = headerRuling.isPresent() ? headerRuling.get().y1 : pages.stream().mapToDouble(ClassificationPage::getPageHeight).max().orElse(Double.MAX_VALUE);
|
||||
double minY = footerRuling.map(ruling -> ruling.y1).orElse(0F);
|
||||
double maxX = pages.stream().mapToDouble(ClassificationPage::getPageWidth).max().orElse(Double.MAX_VALUE);
|
||||
|
||||
return new Rectangle(new Point((float) maxX, (float) maxY), (float) 0, (float) minY, -1);
|
||||
if (!potentialHeaderRulings.isEmpty()) {
|
||||
h = potentialHeaderRulings.get(0).getBottom() - bodyTextFrame.getTopLeft().getY();
|
||||
}
|
||||
return new Rectangle(new Point(x, y), w, h, page.getPageNumber());
|
||||
}
|
||||
|
||||
|
||||
private List<Ruling> getPotentialFooterRulings(ClassificationPage page) {
|
||||
|
||||
return page.getCleanRulings()
|
||||
.getHorizontal()
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getY1() < page.getPageHeight() * RULING_HEIGHT_THRESHOLD)
|
||||
.filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
|
||||
.sorted(Comparator.comparingDouble(Ruling::getTop))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private List<Ruling> getPotentialHeaderRulings(ClassificationPage page) {
|
||||
|
||||
return page.getCleanRulings()
|
||||
.getHorizontal()
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getY1() > page.getPageHeight() * (1 - RULING_HEIGHT_THRESHOLD))
|
||||
.filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
|
||||
.sorted(Comparator.comparingDouble(Ruling::getBottom).reversed())
|
||||
.toList();
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
@ -129,10 +127,10 @@ public class BodyTextFrameService {
|
||||
* @param landscape Calculate for landscape or portrait
|
||||
* @return Rectangle of the text frame
|
||||
*/
|
||||
private Rectangle calculateBodyTextFrame(List<ClassificationPage> pages,
|
||||
FloatFrequencyCounter documentFontSizeCounter,
|
||||
boolean landscape,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
protected Rectangle calculateBodyTextFrame(List<ClassificationPage> pages,
|
||||
FloatFrequencyCounter documentFontSizeCounter,
|
||||
boolean landscape,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
|
||||
float approximateHeaderLineCount;
|
||||
if (layoutParsingType.equals(LayoutParsingType.TAAS)) {
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
@ -26,35 +25,34 @@ public class PageContentExtractor {
|
||||
public List<PageContents> getSortedPageContents(String filename) throws IOException {
|
||||
|
||||
List<PageContents> textPositionSequencesPerPage = new LinkedList<>();
|
||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
|
||||
try (PDDocument pdDocument = Loader.loadPDF(inputStream)) {
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile())) {
|
||||
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setSortByPosition(true);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(pdDocument);
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setSortByPosition(true);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(pdDocument);
|
||||
|
||||
Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences()
|
||||
.stream()
|
||||
.collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees()));
|
||||
Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences()
|
||||
.stream()
|
||||
.collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees()));
|
||||
|
||||
var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation());
|
||||
var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation());
|
||||
|
||||
textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
|
||||
RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
|
||||
RectangleTransformations.toRectangle2D(pdPage.getMediaBox()),
|
||||
stripper.getRulings()));
|
||||
}
|
||||
textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
|
||||
RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
|
||||
RectangleTransformations.toRectangle2D(pdPage.getMediaBox()),
|
||||
stripper.getRulings()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return textPositionSequencesPerPage;
|
||||
}
|
||||
|
||||
|
||||
@ -92,28 +92,28 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
*/
|
||||
LegacyPDFStreamEngine() throws IOException {
|
||||
|
||||
addOperator(new BeginText());
|
||||
addOperator(new Concatenate());
|
||||
addOperator(new DrawObject()); // special text version
|
||||
addOperator(new EndText());
|
||||
addOperator(new SetGraphicsStateParameters());
|
||||
addOperator(new Save());
|
||||
addOperator(new Restore());
|
||||
addOperator(new NextLine());
|
||||
addOperator(new SetCharSpacing());
|
||||
addOperator(new MoveText());
|
||||
addOperator(new MoveTextSetLeading());
|
||||
addOperator(new SetFontAndSize());
|
||||
addOperator(new ShowText());
|
||||
addOperator(new ShowTextAdjusted());
|
||||
addOperator(new SetTextLeading());
|
||||
addOperator(new SetMatrix());
|
||||
addOperator(new SetTextRenderingMode());
|
||||
addOperator(new SetTextRise());
|
||||
addOperator(new SetWordSpacing());
|
||||
addOperator(new SetTextHorizontalScaling());
|
||||
addOperator(new ShowTextLine());
|
||||
addOperator(new ShowTextLineAndSpace());
|
||||
addOperator(new BeginText(this));
|
||||
addOperator(new Concatenate(this));
|
||||
addOperator(new DrawObject(this)); // special text version
|
||||
addOperator(new EndText(this));
|
||||
addOperator(new SetGraphicsStateParameters(this));
|
||||
addOperator(new Save(this));
|
||||
addOperator(new Restore(this));
|
||||
addOperator(new NextLine(this));
|
||||
addOperator(new SetCharSpacing(this));
|
||||
addOperator(new MoveText(this));
|
||||
addOperator(new MoveTextSetLeading(this));
|
||||
addOperator(new SetFontAndSize(this));
|
||||
addOperator(new ShowText(this));
|
||||
addOperator(new ShowTextAdjusted(this));
|
||||
addOperator(new SetTextLeading(this));
|
||||
addOperator(new SetMatrix(this));
|
||||
addOperator(new SetTextRenderingMode(this));
|
||||
addOperator(new SetTextRise(this));
|
||||
addOperator(new SetWordSpacing(this));
|
||||
addOperator(new SetTextHorizontalScaling(this));
|
||||
addOperator(new ShowTextLine(this));
|
||||
addOperator(new ShowTextLineAndSpace(this));
|
||||
|
||||
// load additional glyph list for Unicode mapping
|
||||
String path = "/org/apache/pdfbox/resources/glyphlist/additional.txt";
|
||||
|
||||
@ -52,32 +52,31 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
public PDFLinesTextStripper() throws IOException {
|
||||
|
||||
super();
|
||||
this.addOperator(new SetStrokingColorSpace());
|
||||
this.addOperator(new SetNonStrokingColorSpace());
|
||||
this.addOperator(new SetLineDashPattern());
|
||||
this.addOperator(new SetStrokingDeviceGrayColor());
|
||||
this.addOperator(new SetNonStrokingDeviceGrayColor());
|
||||
this.addOperator(new SetFlatness());
|
||||
this.addOperator(new SetLineJoinStyle());
|
||||
this.addOperator(new SetLineCapStyle());
|
||||
this.addOperator(new SetStrokingDeviceCMYKColor());
|
||||
this.addOperator(new SetNonStrokingDeviceCMYKColor());
|
||||
this.addOperator(new SetLineMiterLimit());
|
||||
this.addOperator(new SetStrokingDeviceRGBColor());
|
||||
this.addOperator(new SetNonStrokingDeviceRGBColor());
|
||||
this.addOperator(new SetRenderingIntent());
|
||||
this.addOperator(new SetStrokingColor());
|
||||
this.addOperator(new SetNonStrokingColor());
|
||||
this.addOperator(new SetStrokingColorN());
|
||||
this.addOperator(new SetNonStrokingColorN());
|
||||
this.addOperator(new SetFontAndSize());
|
||||
this.addOperator(new SetLineWidth());
|
||||
this.addOperator(new SetStrokingColorSpace(this));
|
||||
this.addOperator(new SetNonStrokingColorSpace(this));
|
||||
this.addOperator(new SetLineDashPattern(this));
|
||||
this.addOperator(new SetStrokingDeviceGrayColor(this));
|
||||
this.addOperator(new SetNonStrokingDeviceGrayColor(this));
|
||||
this.addOperator(new SetFlatness(this));
|
||||
this.addOperator(new SetLineJoinStyle(this));
|
||||
this.addOperator(new SetLineCapStyle(this));
|
||||
this.addOperator(new SetStrokingDeviceCMYKColor(this));
|
||||
this.addOperator(new SetNonStrokingDeviceCMYKColor(this));
|
||||
this.addOperator(new SetLineMiterLimit(this));
|
||||
this.addOperator(new SetStrokingDeviceRGBColor(this));
|
||||
this.addOperator(new SetNonStrokingDeviceRGBColor(this));
|
||||
this.addOperator(new SetRenderingIntent(this));
|
||||
this.addOperator(new SetStrokingColor(this));
|
||||
this.addOperator(new SetNonStrokingColor(this));
|
||||
this.addOperator(new SetStrokingColorN(this));
|
||||
this.addOperator(new SetNonStrokingColorN(this));
|
||||
this.addOperator(new SetFontAndSize(this));
|
||||
this.addOperator(new SetLineWidth(this));
|
||||
|
||||
|
||||
addOperator(new BeginMarkedContentSequenceWithProperties());
|
||||
// addOperator(new BeginMarkedContentSequence());
|
||||
addOperator(new EndMarkedContentSequence());
|
||||
|
||||
addOperator(new BeginMarkedContentSequenceWithProperties(this));
|
||||
// addOperator(new BeginMarkedContentSequence(this));
|
||||
addOperator(new EndMarkedContentSequence(this));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -43,7 +43,8 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlin
|
||||
import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.apache.pdfbox.text.TextPositionComparator;
|
||||
import org.apache.pdfbox.util.QuickSort;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
|
||||
|
||||
/**
|
||||
* This is just a copy; only lines 594-607 were adjusted, because of a bug in PDFBox.
|
||||
|
||||
@ -16,6 +16,7 @@ import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup;
|
||||
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties;
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState;
|
||||
@ -39,8 +40,8 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class ViewerDocumentService {
|
||||
|
||||
private static final String layerName = "Layout grid";
|
||||
|
||||
private static final String LAYER_NAME = "Layout grid";
|
||||
private static final int FONT_SIZE = 10;
|
||||
public static final float LINE_WIDTH = 1f;
|
||||
|
||||
@ -48,15 +49,15 @@ public class ViewerDocumentService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream) {
|
||||
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) {
|
||||
|
||||
log.info("Start Viewer Document Creation");
|
||||
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
|
||||
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
|
||||
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
|
||||
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
|
||||
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate);
|
||||
PDFont font = PDType1Font.HELVETICA;
|
||||
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue);
|
||||
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
|
||||
|
||||
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber);
|
||||
@ -119,6 +120,7 @@ public class ViewerDocumentService {
|
||||
dictionariesToUpdate.add(pdPage.getResources().getCOSObject());
|
||||
}
|
||||
dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject());
|
||||
// dictionariesToUpdate.add(pdDocument.getDocument().getTrailer());
|
||||
pdDocument.saveIncremental(outputStream, dictionariesToUpdate);
|
||||
log.info("Saved Viewer Document");
|
||||
}
|
||||
@ -145,7 +147,7 @@ public class ViewerDocumentService {
|
||||
}
|
||||
|
||||
|
||||
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, Set<COSDictionary> dictionariesToUpdate) {
|
||||
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, Set<COSDictionary> dictionariesToUpdate, boolean layerVisibilityDefaultValue) {
|
||||
|
||||
PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
|
||||
PDOptionalContentProperties ocprops = catalog.getOCProperties();
|
||||
@ -154,13 +156,13 @@ public class ViewerDocumentService {
|
||||
catalog.setOCProperties(ocprops);
|
||||
}
|
||||
PDOptionalContentGroup layer = null;
|
||||
if (ocprops.hasGroup(layerName)) {
|
||||
layer = ocprops.getGroup(layerName);
|
||||
if (ocprops.hasGroup(LAYER_NAME)) {
|
||||
layer = ocprops.getGroup(LAYER_NAME);
|
||||
} else {
|
||||
layer = new PDOptionalContentGroup(layerName);
|
||||
layer = new PDOptionalContentGroup(LAYER_NAME);
|
||||
ocprops.addGroup(layer);
|
||||
}
|
||||
ocprops.setGroupEnabled(layer, false);
|
||||
ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue);
|
||||
dictionariesToUpdate.add(catalog.getCOSObject());
|
||||
return layer;
|
||||
}
|
||||
|
||||
@ -12,6 +12,7 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
@ -71,7 +72,7 @@ public class PdfVisualisationUtility {
|
||||
|
||||
contentStream.beginText();
|
||||
contentStream.newLineAtOffset((float) location.getX(), (float) location.getY());
|
||||
contentStream.setFont(PDType1Font.HELVETICA, 10);
|
||||
contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10);
|
||||
contentStream.showText(string);
|
||||
contentStream.endText();
|
||||
contentStream.close();
|
||||
|
||||
@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.util.QuickSort;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
@ -13,7 +13,7 @@ plugins {
|
||||
description = "layoutparser-service-server"
|
||||
|
||||
val jacksonVersion = "2.15.2"
|
||||
val pdfBoxVersion = "3.0.0-RC1"
|
||||
val pdfBoxVersion = "3.0.0"
|
||||
|
||||
dependencies {
|
||||
implementation(project(":layoutparser-service-processor"))
|
||||
|
||||
@ -4,10 +4,8 @@ import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
@ -46,15 +44,13 @@ public class BdrJsonBuildTest extends AbstractTest {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
protected Document buildGraph(File filename) {
|
||||
protected Document buildGraph(File file) {
|
||||
|
||||
try (InputStream inputStream = new FileInputStream(filename)) {
|
||||
try (PDDocument pdDocument = Loader.loadPDF(inputStream)) {
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS,
|
||||
pdDocument,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
}
|
||||
try (PDDocument pdDocument = Loader.loadPDF(file)) {
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS,
|
||||
pdDocument,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
}
|
||||
}
|
||||
|
||||
@ -114,10 +110,7 @@ public class BdrJsonBuildTest extends AbstractTest {
|
||||
|
||||
private static void visualizeSemanticNodes(File file, File resultingFileName, Document document, TextBlock textBlock) throws IOException {
|
||||
|
||||
try (var fileStream = new FileInputStream(file);//
|
||||
PDDocument pdDocument = Loader.loadPDF(fileStream);//
|
||||
var outputStream = new FileOutputStream(resultingFileName)//
|
||||
) {
|
||||
try (PDDocument pdDocument = Loader.loadPDF(file); var outputStream = new FileOutputStream(resultingFileName)) {
|
||||
PdfDraw.drawDocumentGraph(pdDocument, document);
|
||||
PdfDraw.drawTextBlock(pdDocument, textBlock, PdfDraw.Options.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build());
|
||||
pdDocument.save(outputStream);
|
||||
|
||||
@ -96,7 +96,7 @@ public class HeadlinesGoldStandardIntegrationTest {
|
||||
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
|
||||
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(pdfFileResource.getInputStream()),
|
||||
Loader.loadPDF(pdfFileResource.getFile()),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
|
||||
|
||||
@ -12,10 +12,11 @@ import org.springframework.core.io.ClassPathResource;
|
||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class DocumentDataTests extends BuildDocumentGraphTest{
|
||||
public class DocumentDataTests extends BuildDocumentTest {
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void createDocumentDataForAllFiles() {
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
@ -20,10 +19,11 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest {
|
||||
public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
@ -56,7 +56,7 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest {
|
||||
private void writeJsons(Path filename) {
|
||||
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(new FileInputStream(filename.toFile())),
|
||||
Loader.loadPDF(filename.toFile()),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
|
||||
|
||||
@ -16,11 +16,12 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Ta
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentGraphMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class DocumentGraphMappingTest extends BuildDocumentGraphTest {
|
||||
public class DocumentGraphMappingTest extends BuildDocumentTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
|
||||
@ -13,13 +13,14 @@ import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
|
||||
public class DocumentGraphVisualizationTest extends BuildDocumentTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
@ -66,9 +67,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
|
||||
File tmpFile = new File("/tmp/" + Path.of(filename).getFileName().toString() + "_SEMANTIC_NODES_BBOX.pdf");
|
||||
ClassPathResource fileResource = new ClassPathResource(filename);
|
||||
|
||||
try (var fileStream = fileResource.getInputStream();//
|
||||
PDDocument pdDocument = Loader.loadPDF(fileStream)//
|
||||
) {
|
||||
try (PDDocument pdDocument = Loader.loadPDF(fileResource.getFile())) {
|
||||
log.info("drawing document");
|
||||
PdfDraw.drawDocumentGraph(pdDocument, documentGraph);
|
||||
PdfDraw.drawTextBlock(pdDocument, textBlock, PdfDraw.Options.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build());
|
||||
|
||||
@ -12,10 +12,11 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class ViewerDocumentTest extends BuildDocumentGraphTest {
|
||||
public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@ -27,8 +28,8 @@ public class ViewerDocumentTest extends BuildDocumentGraphTest {
|
||||
String fileName = "files/new/VV-511309_OCR.pdf";
|
||||
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getInputStream()); var out = new FileOutputStream(tmpFileName)) {
|
||||
viewerDocumentService.createViewerDocument(pdDocument, document, out);
|
||||
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -92,7 +92,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
assertThat(table.getColCount()).isEqualTo(6);
|
||||
@ -106,7 +106,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
@ -124,7 +124,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||
@ -142,7 +142,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
@ -160,7 +160,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
@ -177,7 +177,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/VV-931175_Page1.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
@ -218,7 +218,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/27 A8637C - EU AIR3 - MCP Section 1 - Identity of the plant protection product_Page6.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
@ -235,7 +235,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
@ -250,7 +250,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izZRMS (CZ) fRR Part B7_Page123.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 6);
|
||||
|
||||
@ -269,7 +269,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/77 Pirimicarb_RAR_08_Volume_3CA_B-6_2017-12-04_Page11.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 3);
|
||||
|
||||
@ -285,7 +285,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/95 Trinexapac-ethyl_RAR_08_Volume_3CA_B-6_2018-01-10_Page532.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 1);
|
||||
validateTable(document, 0, 9, 9, 0, 0);
|
||||
@ -298,7 +298,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page175.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
@ -312,7 +312,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page174.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 1);
|
||||
validateTable(document, 0, 9, 6, 7, 0);
|
||||
@ -325,7 +325,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page35.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 1);
|
||||
validateTable(document, 0, 10, 6, 0, 1);
|
||||
@ -338,7 +338,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page161.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 2);
|
||||
validateTable(document, 0, 2, 2, 0, 0);
|
||||
@ -353,7 +353,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/SinglePages/47 Cyprodinil - EU AIR3 - MCA Section 5 Supplement - Toxicological and metabolism studies on the active substance_Page30.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
@ -369,7 +369,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/SinglePages/49 Cyprodinil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance_Page61.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
@ -384,7 +384,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/81 Pirimicarb_RAR_20_Volume_3CP_A10788A (_Pirimor_)_B-9_2017-12-04_Page54.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
@ -399,7 +399,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/85 Pydiflumetofen_DAR_08_Volume_3CA_B-6_2017-07-26_Page134.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
@ -414,7 +414,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/Thiabendazole DAR Addendum for ED_April_2020_Page18.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
@ -431,7 +431,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/15 - Pretilachlor - Acute Oral Toxicity (Up and Down Procedure) - Rat_Page18.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
@ -446,7 +446,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/SinglePages/28 A8637C - EU AIR3 - MCP Section 10 - Ecotoxicological studies on the plant protection product_Page23.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
@ -461,7 +461,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/24 - SYN549522 - Acute Oral Toxicity - Rats_Page17.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
@ -475,7 +475,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/30 - Dicamba - Acute Oral Toxicity - Rats_Page5.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
|
||||
@ -0,0 +1,31 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class BodyTextFrameServiceTest extends BuildDocumentTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testCalculateBodyTextFrame() {
|
||||
|
||||
String filename = "files/211.pdf";
|
||||
String outputFilename = "/tmp/" + Path.of(filename).getFileName() + "_MAINBODY.pdf";
|
||||
ClassificationDocument document = parseLayout(filename, LayoutParsingType.TAAS);
|
||||
PdfDraw.drawRectanglesPerPage(filename,
|
||||
document.getPages().stream().map(page -> List.of(RectangleTransformations.toRectangle2D(page.getBodyTextFrame()))).toList(),
|
||||
outputFilename);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -5,7 +5,6 @@ import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
@ -19,13 +18,13 @@ import lombok.SneakyThrows;
|
||||
public class RulingCleaningServiceTest {
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
// @Disabled
|
||||
@SneakyThrows
|
||||
public void textRulingExtraction() {
|
||||
|
||||
String fileName = "files/BASF/2013-1110704.pdf";
|
||||
String fileName = "files/211.pdf";
|
||||
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents("files/BASF/2013-1110704.pdf");
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
|
||||
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
|
||||
@ -1,39 +1,35 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
package com.knecon.fforesight.service.layoutparser.server.utils;
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class BuildDocumentGraphTest extends AbstractTest {
|
||||
public abstract class BuildDocumentTest extends AbstractTest {
|
||||
|
||||
@Autowired
|
||||
protected LayoutParsingPipeline layoutParsingPipeline;
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
public void buildMetolachlor() {
|
||||
@SneakyThrows
|
||||
protected ClassificationDocument parseLayout(String filename, LayoutParsingType layoutParsingType) {
|
||||
|
||||
Document documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
assertEquals(221, documentGraph.getPages().size());
|
||||
assertEquals(220, documentGraph.getPages().stream().filter(page -> page.getHeader().hasText()).count());
|
||||
assertEquals(0, documentGraph.getPages().stream().filter(page -> page.getFooter().hasText()).count());
|
||||
ClassPathResource fileResource = new ClassPathResource(filename);
|
||||
prepareStorage(filename);
|
||||
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream.readAllBytes())) {
|
||||
return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -52,14 +48,9 @@ public class BuildDocumentGraphTest extends AbstractTest {
|
||||
} else {
|
||||
prepareStorage(filename);
|
||||
}
|
||||
ClassPathResource fileResource = new ClassPathResource(filename);
|
||||
|
||||
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) {
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(layoutParsingType,
|
||||
pdDocument,
|
||||
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
|
||||
new TableServiceResponse()));
|
||||
}
|
||||
return DocumentGraphFactory.buildDocumentGraph(parseLayout(filename, layoutParsingType));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -5,7 +5,6 @@ import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@ -14,6 +13,7 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
@ -40,10 +40,8 @@ public class PdfDraw {
|
||||
|
||||
public static void drawRectanglesPerPage(String filename, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) throws IOException {
|
||||
|
||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream();//
|
||||
PDDocument pdDocument = Loader.loadPDF(inputStream);//
|
||||
var out = new FileOutputStream(tmpFileName)//
|
||||
) {
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||
pageNumber,
|
||||
@ -58,11 +56,8 @@ public class PdfDraw {
|
||||
|
||||
public static void drawRectanglesPerPageNumberedByLine(String filename, List<List<List<Rectangle2D>>> rectanglesPerPage, String tmpFileName) throws IOException {
|
||||
|
||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream();//
|
||||
PDDocument pdDocument = Loader.loadPDF(inputStream);//
|
||||
var out = new FileOutputStream(tmpFileName)//
|
||||
) {
|
||||
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1);
|
||||
for (int lineNumber = 0; lineNumber < rectanglesOnPage.size(); lineNumber++) {
|
||||
@ -74,13 +69,9 @@ public class PdfDraw {
|
||||
new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2),
|
||||
pageNumber,
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
pdDocument.save(out);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -143,7 +134,7 @@ public class PdfDraw {
|
||||
} else {
|
||||
contentStream.newLineAtOffset((float) location.getX(), (float) location.getY());
|
||||
}
|
||||
contentStream.setFont(PDType1Font.HELVETICA, 10);
|
||||
contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10);
|
||||
contentStream.showText(string);
|
||||
contentStream.endText();
|
||||
contentStream.close();
|
||||
@ -184,11 +175,8 @@ public class PdfDraw {
|
||||
@SneakyThrows
|
||||
public static void drawRectanglesAndLinesPerPage(String filename, List<List<Rectangle2D>> list, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) {
|
||||
|
||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream();//
|
||||
PDDocument pdDocument = Loader.loadPDF(inputStream);//
|
||||
var out = new FileOutputStream(tmpFileName)//
|
||||
) {
|
||||
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
// PdfVisualisationUtility.drawLine2DList(pdDocument,
|
||||
// pageNumber,
|
||||
@ -201,7 +189,6 @@ public class PdfDraw {
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, list.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
}
|
||||
pdDocument.save(out);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -209,12 +196,13 @@ public class PdfDraw {
|
||||
@SneakyThrows
|
||||
public static void drawLinesPerPage(String filename, List<List<Ruling>> linesPerPage, String tmpFileName) {
|
||||
|
||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream();//
|
||||
PDDocument pdDocument = Loader.loadPDF(inputStream);//
|
||||
var out = new FileOutputStream(tmpFileName)//
|
||||
) {
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
PdfVisualisationUtility.drawLine2DList(pdDocument, pageNumber, linesPerPage.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build());
|
||||
PdfVisualisationUtility.drawLine2DList(pdDocument,
|
||||
pageNumber,
|
||||
linesPerPage.get(pageNumber - 1),
|
||||
PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build());
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user