Compare commits

...

11 Commits

Author SHA1 Message Date
Dominique Eifländer
ef23ee0ade Merge branch 'RED-10752-main' into 'main'
RED-10752: Enabled prometheus

See merge request fforesight/layout-parser!267
2025-01-29 13:34:01 +01:00
Dominique Eifländer
af31f52b47 RED-10752: Enabled prometheus 2025-01-29 11:09:29 +01:00
Kilian Schüttler
b5152112ee Merge branch 'RM-231' into 'main'
RM-231: missing whitespace in name

See merge request fforesight/layout-parser!264
2025-01-14 13:04:10 +01:00
Kilian Schuettler
85ea4ef455 RM-231: missing whitespace in name 2025-01-14 12:59:01 +01:00
Kilian Schüttler
01f8c01fff Merge branch 'RED-10714' into 'main'
RED-10714: fix IndexOutOfBoundsException

See merge request fforesight/layout-parser!262
2025-01-10 12:33:18 +01:00
Kilian Schuettler
0b6a292c75 RED-10714: fix IndexOutOfBoundsException 2025-01-10 12:12:14 +01:00
Maverick Studer
e24020589c Merge branch 'feature/RED-9998' into 'main'
RED-9998: App version history (for conditional re-analyzing the layout of a file)

See merge request fforesight/layout-parser!259
2024-12-12 09:58:46 +01:00
Maverick Studer
c619b845e8 RED-9998: App version history (for conditional re-analyzing the layout of a file) 2024-12-12 09:58:46 +01:00
Kilian Schüttler
ed0371ca11 Merge branch 'RED-10127' into 'main'
RED-10127: Paragraphs with multiple table, appendix, figure can't be headlines

See merge request fforesight/layout-parser!257
2024-12-06 14:49:48 +01:00
Kilian Schuettler
89b5be8d67 RED-10127: Paragraphs with multiple table, appendix, figure can't be headlines 2024-12-06 13:41:44 +01:00
Kilian Schuettler
077ce60c9d RED-9139: update document version 2024-11-15 16:48:56 +01:00
18 changed files with 158 additions and 78 deletions

View File

@ -8,7 +8,7 @@ plugins {
group = "com.knecon.fforesight"
val documentVersion by rootProject.extra { "4.425.0-RED9139.13-RED9139.0-RED9139.0" }
val documentVersion by rootProject.extra { "4.433.0" }
java.sourceCompatibility = JavaVersion.VERSION_17
java.targetCompatibility = JavaVersion.VERSION_17

View File

@ -8,13 +8,20 @@ import lombok.Builder;
@Builder
@Schema(description = "Object containing information about the layout parsing.")
public record LayoutParsingFinishedEvent(
@Schema(description = "General purpose identifier. It is returned exactly the same way it is inserted with the LayoutParsingRequest.")
Map<String, String> identifier,//
@Schema(description = "The duration of a single layout parsing in ms.")
long duration,//
@Schema(description = "The number of pages of the parsed document.")
int numberOfPages,//
@Schema(description = "A general message. It contains some information useful for a developer, like the paths where the files are stored. Not meant to be machine readable.")
String message) {
@Schema(description = "General purpose identifier. It is returned exactly the same way it is inserted with the LayoutParsingRequest.") //
Map<String, String> identifier,
@Schema(description = "The duration of a single layout parsing in ms.") //
long duration,
@Schema(description = "The number of pages of the parsed document.") //
int numberOfPages,
@Schema(description = "A general message. It contains some information useful for a developer, like the paths where the files are stored. Not meant to be machine readable.") //
String message,
@Schema(description = "The app version of the layout parser.") //
String layoutParserVersion
) {
}

View File

@ -13,9 +13,8 @@ import lombok.experimental.FieldDefaults;
@Configuration
@ConfigurationProperties("layoutparser")
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutparserSettings {
public class LayoutParserSettings {
boolean debug;
LayoutParsingType layoutParsingTypeOverride;
String pdftronLicense;
}

View File

@ -20,6 +20,7 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper;
@ -87,29 +88,32 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutParsingPipeline {
ImageServiceResponseAdapter imageServiceResponseAdapter;
CvTableParsingAdapter cvTableParsingAdapter;
LayoutParsingStorageService layoutParsingStorageService;
SectionsBuilderService sectionsBuilderService;
SimplifiedSectionTextService simplifiedSectionTextService;
RulingCleaningService rulingCleaningService;
TableExtractionService tableExtractionService;
DocuMineBlockificationService docuMineBlockificationService;
RedactManagerBlockificationService redactManagerBlockificationService;
BlockificationPostprocessingService blockificationPostprocessingService;
DocstrumBlockificationService docstrumBlockificationService;
LayoutGridService layoutGridService;
ObservationRegistry observationRegistry;
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
GraphicExtractorService graphicExtractorService;
OutlineExtractorService outlineExtractorService;
SectionTreeBuilderService sectionTreeBuilderService;
SectionTreeEnhancementService sectionTreeEnhancementService;
LayoutparserSettings settings;
ClassificationService classificationService;
final ImageServiceResponseAdapter imageServiceResponseAdapter;
final CvTableParsingAdapter cvTableParsingAdapter;
final LayoutParsingStorageService layoutParsingStorageService;
final SectionsBuilderService sectionsBuilderService;
final SimplifiedSectionTextService simplifiedSectionTextService;
final RulingCleaningService rulingCleaningService;
final TableExtractionService tableExtractionService;
final DocuMineBlockificationService docuMineBlockificationService;
final RedactManagerBlockificationService redactManagerBlockificationService;
final BlockificationPostprocessingService blockificationPostprocessingService;
final DocstrumBlockificationService docstrumBlockificationService;
final LayoutGridService layoutGridService;
final ObservationRegistry observationRegistry;
final VisualLayoutParsingAdapter visualLayoutParsingAdapter;
final GraphicExtractorService graphicExtractorService;
final OutlineExtractorService outlineExtractorService;
final SectionTreeBuilderService sectionTreeBuilderService;
final SectionTreeEnhancementService sectionTreeEnhancementService;
final LayoutParserSettings settings;
final ClassificationService classificationService;
@Value("${LAYOUT_PARSER_VERSION:}")
private String layoutParserVersion;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -118,17 +122,23 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
.map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse());
.map(layoutParsingStorageService::getVisualLayoutParsingFile)
.orElse(new VisualLayoutParsingResponse());
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
.map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse());
.map(layoutParsingStorageService::getImagesFile)
.orElse(new ImageServiceResponse());
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
.map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse());
.map(layoutParsingStorageService::getTablesFile)
.orElse(new TableServiceResponse());
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();
ClassificationDocument classificationDocument = parseLayout(layoutParsingType,
originFile,
imageServiceResponse,
tableServiceResponse,
@ -137,18 +147,19 @@ public class LayoutParsingPipeline {
log.info("Building document graph for {}", layoutParsingRequest.identifier());
DocumentWithVisualization documentWithVisualization = observeBuildDocumentGraph(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), classificationDocument);
DocumentWithVisualization documentWithVisualization = observeBuildDocumentGraph(layoutParsingType, classificationDocument);
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
layoutGridService.addLayoutGrid(viewerDocumentFile, documentWithVisualization, viewerDocumentFile, false);
layoutGridService.addLayoutGrid(viewerDocumentFile, documentWithVisualization, viewerDocumentFile, layoutParsingType, layoutParserVersion, false);
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentWithVisualization.document()));
if (layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) {
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(), new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
if (layoutParsingRequest.documentMarkdownFileStorageId()
.isPresent()) {
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
.get(), new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
}
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document()));
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
@ -188,6 +199,7 @@ public class LayoutParsingPipeline {
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
.layoutParserVersion(layoutParserVersion)
.build();
}
@ -385,7 +397,8 @@ public class LayoutParsingPipeline {
.flatMap(Collection::stream)
.map(Character::getTextPosition)
.filter(pos -> pos.getDir().equals(dir))
.mapToDouble(RedTextPosition::getExactDir).average().orElse(0);
.mapToDouble(RedTextPosition::getExactDir).average()
.orElse(0);
if (averageRotation == 0) {
continue;

View File

@ -24,7 +24,7 @@ import lombok.EqualsAndHashCode;
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
public class Line extends TextBoundingBox {
private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
private static final double WORD_DISTANCE_MULTIPLIER = 0.17;
@EqualsAndHashCode.Include
private final double x0;
@ -157,6 +157,9 @@ public class Line extends TextBoundingBox {
private void computeWords(List<Character> characters, double wordSpacing) {
// The width of a space should arguably scale with the font size, but it currently depends only on the median distance between horizontal neighbours.
// If font sizes differ widely on a page, this can lead to missing spaces for smaller fonts and superfluous spaces for larger fonts.
// For now only the scaling factor has been changed. If you come across this comment because whitespace is missing again, try scaling with the font size instead of simply changing the factor once more.
Word word = new Word();
Character previous = null;
for (Character current : characters) {

View File

@ -18,10 +18,14 @@ public class ClassificationPatterns {
public static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile(
public static final Pattern TABLE_OR_FIGURE_HEADLINE_PATTERN = Pattern.compile(
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
Pattern.CASE_INSENSITIVE);
public static final Pattern TABLE_MID_SENTENCE_PATTERN = Pattern.compile(
"(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
Pattern.CASE_INSENSITIVE);
public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
public static final Pattern NUMERIC = Pattern.compile("[0-9]+");

View File

@ -6,7 +6,8 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.clas
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_PATTERN;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_MID_SENTENCE_PATTERN;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_HEADLINE_PATTERN;
import java.util.ArrayList;
import java.util.Comparator;
@ -83,7 +84,8 @@ public class DocuMineClassificationService {
Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString());
Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString());
Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString());
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTERN.matcher(textBlock.toString());
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_HEADLINE_PATTERN.matcher(textBlock.toString());
Matcher tableMidSentenceMatcher = TABLE_MID_SENTENCE_PATTERN.matcher(textBlock.toString());
Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString());
boolean isAtLeast3Characters = atLeast3Matcher.reset().find();
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
@ -148,6 +150,8 @@ public class DocuMineClassificationService {
&& greaterOrEqualFontThanPageAverage(textBlock, page)//
&& PositionUtils.getApproxLineCount(textBlock) < 2.9//
&& (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) //
&& tableMidSentenceMatcher.reset().results()
.count() <= 1 //
&& !isAmount//
&& !headlineWithSlashesMatches) {

View File

@ -71,6 +71,9 @@ public class TableOfContentsClassificationService {
private int identifyTOCItems(int start, List<TextBlockOnPage> textBlocks, ClassificationDocument document) {
if (start >= textBlocks.size()) {
return start;
}
ClassificationPage startPage = textBlocks.get(start).page();
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
HashMap<NumberWord, TextBlockOnPage> numberToBlockLookup = new HashMap<>();

View File

@ -10,6 +10,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.Headline;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Paragraph;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.OutlineMapper;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutGrid;
@ -32,9 +33,15 @@ public class LayoutGridService {
@SneakyThrows
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
public void addLayoutGrid(File originFile, DocumentWithVisualization document, File destinationFile, boolean layerVisibilityDefaultValue) {
public void addLayoutGrid(File originFile,
DocumentWithVisualization document,
File destinationFile,
LayoutParsingType layoutParsingType,
String layoutParserVersion,
boolean layerVisibilityDefaultValue) {
LayoutGrid layoutGrid = createLayoutGrid(document.document());
String layoutParsingTypeName = layoutParsingType.name();
LayoutGrid layoutGrid = createLayoutGrid(document.document(), layoutParserVersion, layoutParsingTypeName);
Outline outline = OutlineMapper.createOutline(document.document());
layoutGrid.setVisibleByDefault(layerVisibilityDefaultValue);
@ -42,16 +49,23 @@ public class LayoutGridService {
document.layoutDebugLayer().addOutlineHeadlines(document.document());
if (document.layoutDebugLayer().isActive()) {
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid, document.layoutDebugLayer()), outline);
viewerDocumentService.addLayerGroups(originFile,
destinationFile,
List.of(layoutGrid, document.layoutDebugLayer()),
layoutParserVersion,
layoutParsingTypeName,
outline);
} else {
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid), outline);
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid), layoutParserVersion, layoutParsingTypeName, outline);
}
}
private LayoutGrid createLayoutGrid(Document document) {
private LayoutGrid createLayoutGrid(Document document, String layoutParserVersion, String layoutParsingType) {
LayoutGrid layoutGrid = new LayoutGrid();
layoutGrid.addVersionAndLayoutParsingType(layoutParserVersion, layoutParsingType, document.getFirstPage());
document.streamAllSubNodes()
.peek(layoutGrid::addTreeId)
.forEach(semanticNode -> {

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.visualization;
import java.awt.Color;
import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
@ -25,6 +26,9 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNo
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.viewerdoc.layers.LayoutGridLayerConfig;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
@ -89,10 +93,19 @@ public class LayoutGrid extends LayoutGridLayerConfig {
public void addTreeId(SemanticNode semanticNode) {
Page page = semanticNode.getFirstPage();
if (semanticNode.getBBox().get(page) == null) {
if (semanticNode.getBBox()
.get(page) == null) {
return;
}
addPlacedText(page, semanticNode.getBBox().get(page), semanticNode.getBBox().get(page), buildTreeIdString(semanticNode), 1, treeIds, TREEID_COLOR);
addPlacedText(page,
semanticNode.getBBox()
.get(page),
semanticNode.getBBox()
.get(page),
buildTreeIdString(semanticNode),
1,
treeIds,
TREEID_COLOR);
}
@ -121,7 +134,8 @@ public class LayoutGrid extends LayoutGridLayerConfig {
.toList();
Integer maxChildDepth = subSections.stream()
.map(node -> node.getTreeId().size())
.max(Integer::compareTo).orElse(section.getTreeId().size());
.max(Integer::compareTo)
.orElse(section.getTreeId().size());
int ownDepth = section.getTreeId().size();
Page firstPage = section.getFirstPage();
@ -307,7 +321,8 @@ public class LayoutGrid extends LayoutGridLayerConfig {
Visualizations visualizations = semanticNode.getType().equals(NodeType.TABLE_OF_CONTENTS) ? toc : sections;
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), visualizations).getColoredLines();
int lineWidthModifier = maxChildDepth - ownDepth;
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox().get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier));
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox()
.get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier));
SemanticNode highestParent = semanticNode.getHighestParent();
Rectangle2D highestParentRect = rectangleMap.get(new RectangleIdentifier(highestParent.getTreeId(), page.getNumber()));
@ -356,7 +371,8 @@ public class LayoutGrid extends LayoutGridLayerConfig {
List<Double> ys = yStream.collect(Collectors.toList());
ys.remove(0);
Rectangle2D tableBBox = table.getBBox().get(page);
Rectangle2D tableBBox = table.getBBox()
.get(page);
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), tables).getColoredLines();
xs.forEach(x -> {
@ -398,6 +414,21 @@ public class LayoutGrid extends LayoutGridLayerConfig {
}
/**
 * Places the layout parser version and the layout parsing type as two black text lines on the given page.
 * Both texts are added to the {@code versionAndType} visualizations of the page's number.
 * The anchors are computed at x = 0 in page coordinates and then mapped through the transform returned by
 * {@link CoordinateTransforms#calculatePageCoordsToInitialUserSpaceCoords}.
 *
 * @param version           text of the first line; rendered via {@code String.valueOf}, so {@code null} becomes "null"
 * @param layoutParsingType text of the second line; rendered via {@code String.valueOf}, so {@code null} becomes "null"
 * @param page              page that receives the two texts
 */
public void addVersionAndLayoutParsingType(String version, String layoutParsingType, Page page) {

    PageInformation pageInfo = PageInformation.fromPage(page);
    AffineTransform toUserSpace = CoordinateTransforms.calculatePageCoordsToInitialUserSpaceCoords(pageInfo);

    // First line sits 5 below heightRot(); the second line follows 1.5 font sizes further down.
    double versionLineY = pageInfo.heightRot() - 5;
    double typeLineY = versionLineY - FONT_SIZE * 1.5;

    Point2D versionAnchor = new Point2D.Double(0, versionLineY);
    Point2D typeAnchor = new Point2D.Double(0, typeLineY);
    // The transform writes its result back into the same point instances.
    toUserSpace.transform(versionAnchor, versionAnchor);
    toUserSpace.transform(typeAnchor, typeAnchor);

    var placedTexts = getOrCreateVisualizationsOnPage(page.getNumber(), this.versionAndType).getPlacedTexts();
    placedTexts.add(PlacedText.textFacingUp(String.valueOf(version), versionAnchor, FONT_SIZE, Color.BLACK, FONT));
    placedTexts.add(PlacedText.textFacingUp(String.valueOf(layoutParsingType), typeAnchor, FONT_SIZE, Color.BLACK, FONT));
}
// Immutable carrier bundling a list of colored lines, one rectangle and a list of plain page lines.
// NOTE(review): presumably the combined result of a section-border computation in this class; its producer and consumers are not visible here — confirm.
private record RectangleAndLinesResult(List<ColoredLine> coloredLines, Rectangle2D rectangle, List<Line2D> pageLines) {
}

View File

@ -39,6 +39,7 @@ dependencies {
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
implementation("ch.qos.logback:logback-classic")
api("com.iqser.red.commons:metric-commons:2.3.0")
implementation("com.pdftron:PDFNet:10.11.0")

View File

@ -17,7 +17,6 @@ import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
@ -80,7 +79,7 @@ public class OutlineDetectionTest extends AbstractTest {
long start = System.currentTimeMillis();
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.DOCUMINE_OLD);
var document = buildGraph(fileName, classificationDocument);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), LayoutParsingType.DOCUMINE_OLD, "TEST_VERSION", true);
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
assertEquals(outlineObjectTree.getRootNodes().size(), 8);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(2).size(), 1);

View File

@ -11,7 +11,6 @@ import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
@ -31,6 +30,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
static String TEST_VERSION = "TEST_VERSION";
@BeforeEach
public void init() {
@ -51,7 +52,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
long start = System.currentTimeMillis();
var document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, TEST_VERSION, true);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
}
@ -79,7 +80,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
var document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE_OLD, classificationDocument);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, TEST_VERSION, true);
}
}

View File

@ -10,6 +10,7 @@ If optionalContent is false, the layer will not be created as a OCG, and will no
*/
public record LayerIdentifier(String name, String markedContentName) {
public String markedContentName() {
// The prefix KNECON_ is used to identify marked contents as knecon contents later on
return KNECON_IDENTIFIER_PREFIX + markedContentName;
@ -40,6 +41,7 @@ public record LayerIdentifier(String name, String markedContentName) {
public static final LayerIdentifier KNECON_LAYOUT_FIGURES = new LayerIdentifier("Figures", "LAYOUT_FIGURES");
public static final LayerIdentifier KNECON_LAYOUT_IMAGES = new LayerIdentifier("Images", "LAYOUT_IMAGES");
public static final LayerIdentifier KNECON_LAYOUT_TREE_IDs = new LayerIdentifier("Tree IDs", "LAYOUT_TREE_IDs");
public static final LayerIdentifier KNECON_LAYOUT_VERSION_AND_TYPE = new LayerIdentifier("Version and Type", "LAYOUT_PARSER_VERSION_AND_TYPE");
public static final LayerIdentifier KNECON_LAYOUT_TOC = new LayerIdentifier("Table of Contents", "TABLE_OF_CONTENTS");
//layout grid debug

View File

@ -46,12 +46,13 @@ public class LayoutGridLayerConfig extends AbstractLayerGroup {
protected final Visualizations images = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_IMAGES).build();
protected final Visualizations keyValue = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_KEY_VALUE).build();
protected final Visualizations treeIds = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TREE_IDs).build();
protected final Visualizations versionAndType = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_VERSION_AND_TYPE).build();
@Override
public List<Visualizations> getVisualizations() {
return List.of(headlines, paragraphs, tables, sections, headerFooter, toc, keyValue, figures, images, treeIds);
return List.of(headlines, paragraphs, tables, sections, headerFooter, toc, keyValue, figures, images, treeIds, versionAndType);
}
}

View File

@ -54,7 +54,7 @@ public class PDFTronViewerDocumentService {
@SneakyThrows
@Observed(name = "PDFTronViewerDocumentService", contextualName = "add-visualizations")
public void addLayerGroups(File originFile, File destinationFile, List<LayerGroup> layerGroups, Outline outline) {
public void addLayerGroups(File originFile, File destinationFile, List<LayerGroup> layerGroups, String layoutParserVersion, String layoutParsingType, Outline outline) {
synchronized (PDFNet.class) { // synchronized with class, to ensure multiple instances are also synchronized
@ -116,7 +116,7 @@ public class PDFTronViewerDocumentService {
// OutlineUtility.addOutline(pdfDoc, outline);
ViewerDocVersioningUtility.setVersionInDocument(pdfDoc);
ViewerDocVersioningUtility.setVersionInDocument(pdfDoc, layoutParserVersion, layoutParsingType);
saveDocument(pdfDoc, destinationFile);
} finally {
@ -128,9 +128,9 @@ public class PDFTronViewerDocumentService {
@SneakyThrows
@Observed(name = "PDFTronViewerDocumentService", contextualName = "add-visualizations")
public void addLayerGroups(File originFile, File destinationFile, List<LayerGroup> layerGroups) {
public void addLayerGroups(File originFile, File destinationFile, List<LayerGroup> layerGroups, String layoutParserVersion, String layoutParsingType) {
addLayerGroups(originFile, destinationFile, layerGroups, new Outline());
addLayerGroups(originFile, destinationFile, layerGroups, layoutParserVersion, layoutParsingType, new Outline());
}

View File

@ -8,6 +8,7 @@ import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.sdf.Obj;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
@ -21,10 +22,15 @@ public class ViewerDocVersioningUtility {
@SneakyThrows
public void setVersionInDocument(PDFDoc pdfDoc) {
public void setVersionInDocument(PDFDoc pdfDoc, String layoutParserVersion, String layoutParsingType) {
// Stamps the document info: author is set to AUTHOR, keywords to "<CUSTOM_DICT>:<currentVersion>".
// NOTE(review): presumably isCurrentVersion(...) reads this keywords entry back — confirm against its implementation.
pdfDoc.getDocInfo().setAuthor(AUTHOR);
pdfDoc.getDocInfo().setKeywords(CUSTOM_DICT + ":" + currentVersion);
// Stores the layout parser version and layout parsing type as string entries of a new indirect dictionary
// and attaches that dictionary to the document root under the key "KneconVersionInfo".
Obj versionInfo = pdfDoc.getSDFDoc().createIndirectDict();
versionInfo.putString("LayoutParserVersion", layoutParserVersion);
versionInfo.putString("LayoutParsingType", layoutParsingType);
pdfDoc.getRoot().put("KneconVersionInfo", versionInfo);
}

View File

@ -1,26 +1,18 @@
package com.knecon.fforesight.service.viewerdoc.service;
import java.awt.geom.AffineTransform;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import javax.swing.table.AbstractTableModel;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import com.pdftron.common.Matrix2D;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.PDFNet;
import com.pdftron.pdf.Page;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
@ -48,7 +40,7 @@ class ViewerDocVersioningUtilityTest {
File file = new ClassPathResource("files/empty.pdf").getFile();
Path tmpFile = Files.createTempFile("markedDocument", ".pdf");
try (var in = new FileInputStream(file); var doc = new PDFDoc(in); var out = new FileOutputStream(tmpFile.toFile())) {
ViewerDocVersioningUtility.setVersionInDocument(doc);
ViewerDocVersioningUtility.setVersionInDocument(doc, "layoutParserVersion", "layoutParsingType");
doc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
}
assert ViewerDocVersioningUtility.isCurrentVersion(tmpFile.toFile());