RED-7074: Design Subsection section tree structure algorithm
* improved merging of headlines as well as splitting logic so that more headlines are detected correctly
This commit is contained in:
parent
2fcaeb3d8c
commit
1856fed640
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor;
|
|||||||
|
|
||||||
import static java.lang.String.format;
|
import static java.lang.String.format;
|
||||||
|
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -210,15 +211,15 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||||
|
|
||||||
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
return format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||||
numberOfPages,
|
numberOfPages,
|
||||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -239,6 +240,7 @@ public class LayoutParsingPipeline {
|
|||||||
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
||||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||||
|
OutlineObject lastProcessedOutlineObject = null;
|
||||||
|
|
||||||
// parsing the structure elements could be useful as well
|
// parsing the structure elements could be useful as well
|
||||||
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
||||||
@ -307,11 +309,16 @@ public class LayoutParsingPipeline {
|
|||||||
classificationPage.setPageWidth(cropbox.getWidth());
|
classificationPage.setPageWidth(cropbox.getWidth());
|
||||||
classificationPage.setPageHeight(cropbox.getHeight());
|
classificationPage.setPageHeight(cropbox.getHeight());
|
||||||
|
|
||||||
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage()
|
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
|
||||||
.get(pageNumber - 1);
|
|
||||||
if (outlineObjects != null) {
|
OutlineObject notFoundOutlineObject = null;
|
||||||
|
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
|
||||||
|
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight()));
|
||||||
|
notFoundOutlineObject = lastProcessedOutlineObject;
|
||||||
|
}
|
||||||
|
if (!outlineObjects.isEmpty()) {
|
||||||
classificationPage.setOutlineObjects(outlineObjects);
|
classificationPage.setOutlineObjects(outlineObjects);
|
||||||
blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage);
|
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
|
||||||
}
|
}
|
||||||
|
|
||||||
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
||||||
|
|||||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||||
|
|||||||
@ -5,16 +5,27 @@ import java.awt.geom.Point2D;
|
|||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.NoArgsConstructor;
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@NoArgsConstructor
|
@RequiredArgsConstructor
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
public class OutlineObject {
|
public class OutlineObject {
|
||||||
|
|
||||||
private String title;
|
private final String title;
|
||||||
private int pageNumber;
|
private final int pageNumber;
|
||||||
private Point2D point;
|
private Point2D point;
|
||||||
private int treeDepth;
|
private final int treeDepth;
|
||||||
|
|
||||||
|
private boolean found = false;
|
||||||
|
|
||||||
|
|
||||||
|
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
|
||||||
|
|
||||||
|
this(title, pageNumber, depth);
|
||||||
|
this.point = point2D;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|||||||
@ -80,7 +80,10 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||||
|
|
||||||
List<TextPositionSequence> sequences = textBlocksToMerge.stream().map(TextPageBlock::getSequences).flatMap(java.util.Collection::stream).toList();
|
List<TextPositionSequence> sequences = textBlocksToMerge.stream()
|
||||||
|
.map(TextPageBlock::getSequences)
|
||||||
|
.flatMap(java.util.Collection::stream)
|
||||||
|
.toList();
|
||||||
sequences = new ArrayList<>(sequences);
|
sequences = new ArrayList<>(sequences);
|
||||||
return fromTextPositionSequences(sequences);
|
return fromTextPositionSequences(sequences);
|
||||||
}
|
}
|
||||||
@ -106,11 +109,11 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
if (textBlock == null) {
|
if (textBlock == null) {
|
||||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||||
wordBlock.getMaxXDirAdj(),
|
wordBlock.getMaxXDirAdj(),
|
||||||
wordBlock.getMinYDirAdj(),
|
wordBlock.getMinYDirAdj(),
|
||||||
wordBlock.getMaxYDirAdj(),
|
wordBlock.getMaxYDirAdj(),
|
||||||
wordBlockList,
|
wordBlockList,
|
||||||
wordBlock.getRotation());
|
wordBlock.getRotation());
|
||||||
} else {
|
} else {
|
||||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||||
@ -126,11 +129,12 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences()
|
if (textBlock != null
|
||||||
.stream()
|
&& textBlock.getSequences() != null
|
||||||
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
|
&& textBlock.getSequences()
|
||||||
.collect(toSet())
|
.stream()
|
||||||
.size() == 1) {
|
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
|
||||||
|
.collect(toSet()).size() == 1) {
|
||||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||||
}
|
}
|
||||||
return textBlock;
|
return textBlock;
|
||||||
@ -290,18 +294,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
public void add(TextPositionSequence r) {
|
public void add(TextPositionSequence r) {
|
||||||
|
|
||||||
if (r.getMinXDirAdj() < minX) {
|
setCoordinates(r);
|
||||||
minX = r.getMinXDirAdj();
|
|
||||||
}
|
|
||||||
if (r.getMaxXDirAdj() > maxX) {
|
|
||||||
maxX = r.getMaxXDirAdj();
|
|
||||||
}
|
|
||||||
if (r.getMinYDirAdj() < minY) {
|
|
||||||
minY = r.getMinYDirAdj();
|
|
||||||
}
|
|
||||||
if (r.getMaxYDirAdj() > maxY) {
|
|
||||||
maxY = r.getMaxYDirAdj();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -317,6 +310,33 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void resize() {
|
||||||
|
|
||||||
|
minX = Float.MAX_VALUE;
|
||||||
|
minY = Float.MAX_VALUE;
|
||||||
|
maxX = Float.MIN_VALUE;
|
||||||
|
maxY = Float.MIN_VALUE;
|
||||||
|
sequences.forEach(this::setCoordinates);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void setCoordinates(TextPositionSequence sequence) {
|
||||||
|
|
||||||
|
if (sequence.getMinXDirAdj() < minX) {
|
||||||
|
minX = sequence.getMinXDirAdj();
|
||||||
|
}
|
||||||
|
if (sequence.getMaxXDirAdj() > maxX) {
|
||||||
|
maxX = sequence.getMaxXDirAdj();
|
||||||
|
}
|
||||||
|
if (sequence.getMinYDirAdj() < minY) {
|
||||||
|
minY = sequence.getMinYDirAdj();
|
||||||
|
}
|
||||||
|
if (sequence.getMaxYDirAdj() > maxY) {
|
||||||
|
maxY = sequence.getMaxYDirAdj();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public void set(float x1, float y1, float x2, float y2) {
|
public void set(float x1, float y1, float x2, float y2) {
|
||||||
|
|
||||||
this.minX = Math.min(x1, x2);
|
this.minX = Math.min(x1, x2);
|
||||||
|
|||||||
@ -37,50 +37,129 @@ public class BlockificationPostprocessingService {
|
|||||||
.collect(RectangleTransformations.collectBBox());
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
|
||||||
|
|
||||||
public void sanitizeOutlineBlocks(ClassificationPage classificationPage) {
|
public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
|
||||||
|
|
||||||
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
|
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
|
||||||
|
|
||||||
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
|
if (getTextPageBlocks(classificationPage).isEmpty() || outlineObjects.isEmpty()) {
|
||||||
.stream()
|
return null;
|
||||||
.filter(block -> block instanceof TextPageBlock)
|
|
||||||
.toList()
|
|
||||||
.stream()
|
|
||||||
.map(block -> (TextPageBlock) block)
|
|
||||||
.toList();
|
|
||||||
|
|
||||||
if (textBlocks.isEmpty() || outlineObjects.isEmpty()) {
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
float pageHeight = classificationPage.getPageHeight();
|
float pageHeight = classificationPage.getPageHeight();
|
||||||
|
|
||||||
for (OutlineObject outlineObject : outlineObjects) {
|
ListIterator<OutlineObject> outlineObjectListIterator = outlineObjects.listIterator();
|
||||||
|
|
||||||
OutlineProcessionContext context = new OutlineProcessionContext(outlineObject);
|
if (notFoundOutlineObject != null) {
|
||||||
|
OutlineProcessionContext notFoundOutlineObjectProcessionContext = new OutlineProcessionContext(notFoundOutlineObject);
|
||||||
|
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, notFoundOutlineObjectProcessionContext);
|
||||||
|
|
||||||
ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
|
OutlineObject firstOutlineObject = null;
|
||||||
while (iterator.hasNext()) {
|
OutlineProcessionContext firstOutlineObjectProcessionContext = null;
|
||||||
TextPageBlock pageBlock = iterator.next();
|
if (outlineObjectListIterator.hasNext()) {
|
||||||
if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) {
|
firstOutlineObject = outlineObjectListIterator.next();
|
||||||
break;
|
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
|
||||||
}
|
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
|
||||||
}
|
}
|
||||||
if (iterator.hasPrevious()) {
|
|
||||||
iterator.previous();
|
|
||||||
}
|
|
||||||
boolean earlyStop = false;
|
|
||||||
while (iterator.hasNext() && !earlyStop) {
|
|
||||||
TextPageBlock pageBlock = iterator.next();
|
|
||||||
earlyStop = processOutlineForTextBlock(pageBlock, context);
|
|
||||||
}
|
|
||||||
selectMatch(classificationPage, context);
|
|
||||||
|
|
||||||
|
if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) {
|
||||||
|
notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext));
|
||||||
|
}
|
||||||
|
if (firstOutlineObject != null) {
|
||||||
|
// re-create the context for the updated blocks
|
||||||
|
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
|
||||||
|
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
|
||||||
|
firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
outlineObjectListIterator.forEachRemaining(outlineObject -> {
|
||||||
|
OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject);
|
||||||
|
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext);
|
||||||
|
outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext));
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!outlineObjects.isEmpty()) {
|
||||||
|
return outlineObjects.get(outlineObjects.size() - 1);
|
||||||
|
} else {
|
||||||
|
return notFoundOutlineObject;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
|
private static List<TextPageBlock> getTextPageBlocks(ClassificationPage classificationPage) {
|
||||||
|
|
||||||
|
return classificationPage.getTextBlocks()
|
||||||
|
.stream()
|
||||||
|
.filter(block -> block instanceof TextPageBlock)
|
||||||
|
.map(block -> (TextPageBlock) block)
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean contextsOverlap(OutlineProcessionContext notFoundOutlineObjectProcessionContext, OutlineProcessionContext firstOutlineObjectProcessionContext) {
|
||||||
|
|
||||||
|
if (firstOutlineObjectProcessionContext == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
String notFoundTitle = notFoundOutlineObjectProcessionContext.getOutlineObject().getTitle();
|
||||||
|
String firstTitle = firstOutlineObjectProcessionContext.getOutlineObject().getTitle();
|
||||||
|
|
||||||
|
if (!firstTitle.startsWith(notFoundTitle)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
var blocksOfNotFoundOutline = getAllMatchingBlocks(notFoundOutlineObjectProcessionContext);
|
||||||
|
var blocksOfFirstOutline = getAllMatchingBlocks(firstOutlineObjectProcessionContext);
|
||||||
|
|
||||||
|
double maxYFirst = blocksOfFirstOutline.stream()
|
||||||
|
.mapToDouble(TextPageBlock::getPdfMaxY)
|
||||||
|
.max()
|
||||||
|
.orElse(Double.NEGATIVE_INFINITY);
|
||||||
|
|
||||||
|
return blocksOfNotFoundOutline.stream()
|
||||||
|
.mapToDouble(TextPageBlock::getPdfMaxY)
|
||||||
|
.anyMatch(y -> y >= maxYFirst);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<TextPageBlock> getAllMatchingBlocks(OutlineProcessionContext context) {
|
||||||
|
|
||||||
|
List<TextPageBlock> blocks = new ArrayList<>();
|
||||||
|
if (context.getDirectMatch() != null) {
|
||||||
|
blocks.add(context.getDirectMatch());
|
||||||
|
}
|
||||||
|
if (context.getSplitCandidate() != null) {
|
||||||
|
blocks.add(context.getSplitCandidate());
|
||||||
|
}
|
||||||
|
blocks.addAll(context.getMergeCandidates());
|
||||||
|
return blocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void processTextBlocks(List<TextPageBlock> textBlocks, float pageHeight, OutlineProcessionContext context) {
|
||||||
|
|
||||||
|
OutlineObject outlineObject = context.getOutlineObject();
|
||||||
|
ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
|
||||||
|
while (iterator.hasNext()) {
|
||||||
|
TextPageBlock pageBlock = iterator.next();
|
||||||
|
if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (iterator.hasPrevious()) {
|
||||||
|
iterator.previous();
|
||||||
|
}
|
||||||
|
boolean earlyStop = false;
|
||||||
|
while (iterator.hasNext() && !earlyStop) {
|
||||||
|
TextPageBlock pageBlock = iterator.next();
|
||||||
|
earlyStop = processOutlineForTextBlock(pageBlock, context);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
|
||||||
|
|
||||||
OutlineObject outlineObject = context.outlineObject;
|
OutlineObject outlineObject = context.outlineObject;
|
||||||
TextPageBlock directMatch = context.directMatch;
|
TextPageBlock directMatch = context.directMatch;
|
||||||
@ -122,28 +201,39 @@ public class BlockificationPostprocessingService {
|
|||||||
double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates));
|
double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates));
|
||||||
|
|
||||||
if (minDistance == Double.MAX_VALUE) {
|
if (minDistance == Double.MAX_VALUE) {
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
if (minDistance == distanceToDirectMatch) {
|
if (minDistance == distanceToDirectMatch) {
|
||||||
directMatch.setClassification(headlineType);
|
directMatch.setClassification(headlineType);
|
||||||
} else if (minDistance == distanceToSplitCandidate) {
|
} else if (minDistance == distanceToSplitCandidate) {
|
||||||
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier + outlineObject.getTitle());
|
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle());
|
||||||
splitCandidate.setClassification(headlineType);
|
splitCandidate.setClassification(headlineType);
|
||||||
others.forEach(other -> other.setClassification(null));
|
others.forEach(other -> other.setClassification(null));
|
||||||
} else {
|
} else {
|
||||||
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
|
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
|
||||||
merged.setClassification(headlineType);
|
merged.setClassification(headlineType);
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<TextPageBlock> splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, String text) {
|
private List<TextPageBlock> splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) {
|
||||||
|
|
||||||
List<TextPageBlock> otherBlocks = new ArrayList<>();
|
List<TextPageBlock> otherBlocks = new ArrayList<>();
|
||||||
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
|
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
|
||||||
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), text);
|
|
||||||
|
String headline = title;
|
||||||
|
if (!sectionIdentifier.getFormat().equals(SectionIdentifier.Format.EMPTY) && !title.startsWith(sectionIdentifier.getIdentifierString())) {
|
||||||
|
headline = sectionIdentifier + headline;
|
||||||
|
}
|
||||||
|
|
||||||
|
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline);
|
||||||
|
if (wordSequenceResult.inSequence.isEmpty()) {
|
||||||
|
wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title);
|
||||||
|
}
|
||||||
|
|
||||||
blockToSplit.setSequences(wordSequenceResult.inSequence);
|
blockToSplit.setSequences(wordSequenceResult.inSequence);
|
||||||
|
blockToSplit.resize();
|
||||||
|
|
||||||
if (!wordSequenceResult.preSequence.isEmpty()) {
|
if (!wordSequenceResult.preSequence.isEmpty()) {
|
||||||
TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
|
TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
|
||||||
@ -301,6 +391,7 @@ public class BlockificationPostprocessingService {
|
|||||||
|
|
||||||
assert firstBlock != null;
|
assert firstBlock != null;
|
||||||
firstBlock.setToDuplicate(false);
|
firstBlock.setToDuplicate(false);
|
||||||
|
firstBlock.resize();
|
||||||
classificationPage.getTextBlocks().removeAll(mergedBlocks);
|
classificationPage.getTextBlocks().removeAll(mergedBlocks);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -378,13 +469,13 @@ public class BlockificationPostprocessingService {
|
|||||||
if (blockTextContainsOutlineTitle) {
|
if (blockTextContainsOutlineTitle) {
|
||||||
SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText);
|
SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText);
|
||||||
|
|
||||||
if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY) {
|
if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY && !outlineTitle.startsWith(sectionIdentifier.getIdentifierString())) {
|
||||||
|
|
||||||
if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) {
|
if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) {
|
||||||
context.directMatch = pageBlock;
|
context.directMatch = pageBlock;
|
||||||
return true;
|
return true;
|
||||||
} else if (context.splitCandidate == null) {
|
} else if (context.splitCandidate == null) {
|
||||||
context.sectionIdentifier = sectionIdentifier.getIdentifierString();
|
context.sectionIdentifier = sectionIdentifier;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (context.splitCandidate == null) {
|
if (context.splitCandidate == null) {
|
||||||
@ -408,7 +499,7 @@ public class BlockificationPostprocessingService {
|
|||||||
private OutlineObject outlineObject;
|
private OutlineObject outlineObject;
|
||||||
private List<TextPageBlock> mergeCandidates;
|
private List<TextPageBlock> mergeCandidates;
|
||||||
private TextPageBlock splitCandidate;
|
private TextPageBlock splitCandidate;
|
||||||
private String sectionIdentifier;
|
private SectionIdentifier sectionIdentifier;
|
||||||
|
|
||||||
|
|
||||||
public OutlineProcessionContext(OutlineObject outlineObject) {
|
public OutlineProcessionContext(OutlineObject outlineObject) {
|
||||||
@ -417,7 +508,7 @@ public class BlockificationPostprocessingService {
|
|||||||
this.directMatch = null;
|
this.directMatch = null;
|
||||||
this.mergeCandidates = new ArrayList<>();
|
this.mergeCandidates = new ArrayList<>();
|
||||||
this.splitCandidate = null;
|
this.splitCandidate = null;
|
||||||
this.sectionIdentifier = "";
|
this.sectionIdentifier = SectionIdentifier.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -58,18 +58,20 @@ public class DocstrumBlockificationService {
|
|||||||
zones.forEach(zone -> {
|
zones.forEach(zone -> {
|
||||||
|
|
||||||
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||||
zone.getLines().forEach(line -> {
|
zone.getLines()
|
||||||
line.getWords().forEach(word -> {
|
.forEach(line -> {
|
||||||
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
|
line.getWords()
|
||||||
});
|
.forEach(word -> {
|
||||||
});
|
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings));
|
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings));
|
||||||
});
|
});
|
||||||
|
|
||||||
if (xyOrder) {
|
if (xyOrder) {
|
||||||
abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||||
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||||
abstractPageBlocks.sort(new Comparator<AbstractPageBlock>() {
|
abstractPageBlocks.sort(new Comparator<AbstractPageBlock>() {
|
||||||
@Override
|
@Override
|
||||||
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
|
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
|
||||||
@ -90,7 +92,7 @@ public class DocstrumBlockificationService {
|
|||||||
while (itty.hasNext()) {
|
while (itty.hasNext()) {
|
||||||
|
|
||||||
AbstractPageBlock block = itty.next();
|
AbstractPageBlock block = itty.next();
|
||||||
if (block instanceof TablePageBlock || previous.isHeadline()) {
|
if (block instanceof TablePageBlock) {
|
||||||
previous = new TextPageBlock();
|
previous = new TextPageBlock();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -98,11 +100,21 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
if (previous != null && !previous.getSequences().isEmpty()) {
|
if (previous != null && !previous.getSequences().isEmpty()) {
|
||||||
|
|
||||||
if (current.getDir() != previous.getDir() || current.isHeadline()) {
|
if (current.getDir() != previous.getDir()) {
|
||||||
previous = current;
|
previous = current;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (current.isHeadline() || previous.isHeadline()) {
|
||||||
|
if (intersectsYWithPreviousHavingMaxOneLine(previous, current, page)) {
|
||||||
|
previous = combineBlocksAndResetIterator(previous, current, itty, false);
|
||||||
|
} else {
|
||||||
|
previous = current;
|
||||||
|
}
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
||||||
previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
||||||
continue;
|
continue;
|
||||||
@ -134,8 +146,8 @@ public class DocstrumBlockificationService {
|
|||||||
private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||||
|
|
||||||
return current.intersectsY(previous) //
|
return current.intersectsY(previous) //
|
||||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
|
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
|
||||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0;
|
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -144,16 +156,23 @@ public class DocstrumBlockificationService {
|
|||||||
ClassificationPage page) {
|
ClassificationPage page) {
|
||||||
|
|
||||||
return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
|
return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
|
||||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
|
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
|
||||||
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
|
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||||
|
|
||||||
|
return previous.intersectsY(current)//(Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD && Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
|
||||||
|
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||||
|
|
||||||
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
|
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
|
||||||
&& previous.intersectsY(current) //
|
&& previous.intersectsY(current) //
|
||||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0;
|
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -162,7 +181,7 @@ public class DocstrumBlockificationService {
|
|||||||
previous.getSequences().addAll(current.getSequences());
|
previous.getSequences().addAll(current.getSequences());
|
||||||
previous = buildTextBlock(previous.getSequences(), 0);
|
previous = buildTextBlock(previous.getSequences(), 0);
|
||||||
previous.setToDuplicate(toDuplicate);
|
previous.setToDuplicate(toDuplicate);
|
||||||
if(current.getClassification() != null && previous.getClassification() == null) {
|
if (current.getClassification() != null && previous.getClassification() == null) {
|
||||||
previous.setClassification(current.getClassification());
|
previous.setClassification(current.getClassification());
|
||||||
}
|
}
|
||||||
itty.remove();
|
itty.remove();
|
||||||
@ -216,14 +235,14 @@ public class DocstrumBlockificationService {
|
|||||||
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
||||||
while (itty.hasNext()) {
|
while (itty.hasNext()) {
|
||||||
AbstractPageBlock block = itty.next();
|
AbstractPageBlock block = itty.next();
|
||||||
if(block == null){
|
if (block == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (block instanceof TablePageBlock) {
|
if (block instanceof TablePageBlock) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(block.getClassification() != null && block.getClassification().isHeadline()) {
|
if (block.getClassification() != null && block.getClassification().isHeadline()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -232,7 +251,7 @@ public class DocstrumBlockificationService {
|
|||||||
for (int i = 0; i < blocks.size(); i++) {
|
for (int i = 0; i < blocks.size(); i++) {
|
||||||
|
|
||||||
AbstractPageBlock abstractPageBlock = blocks.get(i);
|
AbstractPageBlock abstractPageBlock = blocks.get(i);
|
||||||
if(abstractPageBlock == null){
|
if (abstractPageBlock == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (abstractPageBlock == current) {
|
if (abstractPageBlock == current) {
|
||||||
@ -242,13 +261,12 @@ public class DocstrumBlockificationService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) {
|
if (abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
|
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
|
||||||
|
|
||||||
|
|
||||||
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
|
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
|
||||||
|
|
||||||
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||||
@ -262,8 +280,8 @@ public class DocstrumBlockificationService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
var blocksIterator = blocks.iterator();
|
var blocksIterator = blocks.iterator();
|
||||||
while(blocksIterator.hasNext()){
|
while (blocksIterator.hasNext()) {
|
||||||
if(blocksIterator.next() == null){
|
if (blocksIterator.next() == null) {
|
||||||
blocksIterator.remove();
|
blocksIterator.remove();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -351,11 +369,11 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
if (textBlock == null) {
|
if (textBlock == null) {
|
||||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||||
wordBlock.getMaxXDirAdj(),
|
wordBlock.getMaxXDirAdj(),
|
||||||
wordBlock.getMinYDirAdj(),
|
wordBlock.getMinYDirAdj(),
|
||||||
wordBlock.getMaxYDirAdj(),
|
wordBlock.getMaxYDirAdj(),
|
||||||
wordBlockList,
|
wordBlockList,
|
||||||
wordBlock.getRotation());
|
wordBlock.getRotation());
|
||||||
} else {
|
} else {
|
||||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||||
@ -371,7 +389,12 @@ public class DocstrumBlockificationService {
|
|||||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
if (textBlock != null
|
||||||
|
&& textBlock.getSequences() != null
|
||||||
|
&& textBlock.getSequences()
|
||||||
|
.stream()
|
||||||
|
.map(t -> round(t.getMinYDirAdj(), 3))
|
||||||
|
.collect(toSet()).size() == 1) {
|
||||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||||
}
|
}
|
||||||
return textBlock;
|
return textBlock;
|
||||||
@ -386,38 +409,34 @@ public class DocstrumBlockificationService {
|
|||||||
List<Ruling> horizontalRulingLines,
|
List<Ruling> horizontalRulingLines,
|
||||||
List<Ruling> verticalRulingLines) {
|
List<Ruling> verticalRulingLines) {
|
||||||
|
|
||||||
return isSplitByRuling(maxX,
|
return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight())
|
||||||
minY,
|
//
|
||||||
word.getMinXDirAdj(),
|
|| isSplitByRuling(minX,
|
||||||
word.getMinYDirAdj(),
|
minY,
|
||||||
verticalRulingLines,
|
word.getMinXDirAdj(),
|
||||||
word.getDir().getDegrees(),
|
word.getMaxYDirAdj(),
|
||||||
word.getPageWidth(),
|
horizontalRulingLines,
|
||||||
word.getPageHeight()) //
|
word.getDir().getDegrees(),
|
||||||
|| isSplitByRuling(minX,
|
word.getPageWidth(),
|
||||||
minY,
|
word.getPageHeight())
|
||||||
word.getMinXDirAdj(),
|
//
|
||||||
word.getMaxYDirAdj(),
|
|| isSplitByRuling(maxX,
|
||||||
horizontalRulingLines,
|
minY,
|
||||||
word.getDir().getDegrees(),
|
word.getMinXDirAdj(),
|
||||||
word.getPageWidth(),
|
word.getMinYDirAdj(),
|
||||||
word.getPageHeight()) //
|
horizontalRulingLines,
|
||||||
|| isSplitByRuling(maxX,
|
word.getDir().getDegrees(),
|
||||||
minY,
|
word.getPageWidth(),
|
||||||
word.getMinXDirAdj(),
|
word.getPageHeight())
|
||||||
word.getMinYDirAdj(),
|
//
|
||||||
horizontalRulingLines,
|
|| isSplitByRuling(minX,
|
||||||
word.getDir().getDegrees(),
|
minY,
|
||||||
word.getPageWidth(),
|
word.getMinXDirAdj(),
|
||||||
word.getPageHeight()) //
|
word.getMaxYDirAdj(),
|
||||||
|| isSplitByRuling(minX,
|
verticalRulingLines,
|
||||||
minY,
|
word.getDir().getDegrees(),
|
||||||
word.getMinXDirAdj(),
|
word.getPageWidth(),
|
||||||
word.getMaxYDirAdj(),
|
word.getPageHeight());
|
||||||
verticalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -82,13 +82,15 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
|
|
||||||
|
//String fileName = "files/documine/21_TiltPlus_MutacaoGenicaEmCelulasBacterianas.pdf";//fail here
|
||||||
|
|
||||||
//String fileName = "files/new/UTT-Books-53.pdf";
|
|
||||||
String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf";
|
|
||||||
|
|
||||||
|
|
||||||
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
|
|
||||||
//String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf";
|
//String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf";
|
||||||
|
//String fileName = "files/documine/Study Document 3 - Acute Eye IrritationCorrosion - Rabbits.pdf";
|
||||||
|
//String fileName = "files/documine/VV-547521_Irritação_Ocular_in_Vivo.pdf";
|
||||||
|
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
|
||||||
|
//String fileName = "files/new/UTT-Books-53.pdf";
|
||||||
|
//String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf";
|
||||||
|
//String fileName = "files/documine/A16361B - Acute Dermal Irritation Toxicity Study in Rabbits.pdf";
|
||||||
//String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf";
|
//String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf";
|
||||||
//String fileName = "files/documine/VV-547523_LLNA.pdf";
|
//String fileName = "files/documine/VV-547523_LLNA.pdf";
|
||||||
//String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
//String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||||
@ -96,7 +98,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
|
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
|
||||||
//String fileName = "files/new/kaust-official-thesis-template.pdf";
|
//String fileName = "files/new/kaust-official-thesis-template.pdf";
|
||||||
//String fileName = "files/new/$100m Offers.pdf";
|
//String fileName = "files/new/$100m Offers.pdf";
|
||||||
//String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
||||||
//String fileName = "files/new/mistitled_outlines_example.pdf";
|
//String fileName = "files/new/mistitled_outlines_example.pdf";
|
||||||
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
|
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user