TAAS-41: TAAS Document Structure
* changed TextPageBlock splitting
* changed Header and Footer Classification
* added TAAS Document Structure Prototype
This commit is contained in:
parent
dfdeef5812
commit
f08c4ced43
@ -1,4 +1,4 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||||
|
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
|
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
@ -0,0 +1,20 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
public class ParagraphData {
|
||||||
|
|
||||||
|
private String text;
|
||||||
|
List<Range> boldTextBoundaries;
|
||||||
|
List<Range> italicTextBoundaries;
|
||||||
|
private String classification;
|
||||||
|
|
||||||
|
private String orientation;
|
||||||
|
private int textDirection;
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,5 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||||
|
|
||||||
|
/**
 * Immutable pair of {@code int} positions.
 * <p>
 * Used by {@code ParagraphData} to describe bold and italic text boundaries. No ordering
 * between {@code start} and {@code end} is enforced, and whether {@code end} is inclusive
 * is not defined here (TODO confirm with the producing mapper).
 *
 * @param start first position of the range
 * @param end   last position of the range
 */
public record Range(int start, int end) {}
|
||||||
@ -0,0 +1,16 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Builder
|
||||||
|
@Data
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class ResearchDocumentData {
|
||||||
|
|
||||||
|
String originalFile;
|
||||||
|
List<StructureObject> structureObjects;
|
||||||
|
}
|
||||||
@ -0,0 +1,15 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class RowData {
|
||||||
|
|
||||||
|
boolean header;
|
||||||
|
List<String> cellText;
|
||||||
|
float[] bBox;
|
||||||
|
}
|
||||||
@ -0,0 +1,19 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class StructureObject {
|
||||||
|
|
||||||
|
Integer structureObjectNumber;
|
||||||
|
int page;
|
||||||
|
int stringOffset;
|
||||||
|
float[] boundingBox;
|
||||||
|
ParagraphData paragraph;
|
||||||
|
TableData table;
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,15 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class TableData {
|
||||||
|
|
||||||
|
List<RowData> rowData;
|
||||||
|
Integer numberOfCols;
|
||||||
|
Integer numberOfRows;
|
||||||
|
}
|
||||||
@ -19,7 +19,7 @@ import com.knecon.fforesight.service.layoutparser.processor.classification.servi
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService;
|
import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentDataMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
@ -84,4 +84,27 @@ public class LayoutParsingService {
|
|||||||
return DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
return DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Document parseLayoutWithTimer(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {
|
||||||
|
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
|
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument,
|
||||||
|
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
|
||||||
|
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
|
||||||
|
System.out.printf("parsed %d ms", System.currentTimeMillis() - start);
|
||||||
|
|
||||||
|
start = System.currentTimeMillis();
|
||||||
|
classificationService.classifyDocument(classificationDocument);
|
||||||
|
System.out.printf(", classified %d ms", System.currentTimeMillis() - start);
|
||||||
|
|
||||||
|
start = System.currentTimeMillis();
|
||||||
|
sectionsBuilderService.buildSections(classificationDocument);
|
||||||
|
System.out.printf(", sections built %d ms", System.currentTimeMillis() - start);
|
||||||
|
|
||||||
|
start = System.currentTimeMillis();
|
||||||
|
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||||
|
System.out.printf(", graph constructed %d ms", System.currentTimeMillis() - start);
|
||||||
|
return document;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -14,11 +14,11 @@ import org.springframework.stereotype.Service;
|
|||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.iqser.red.storage.commons.service.StorageService;
|
import com.iqser.red.storage.commons.service.StorageService;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTreeData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.PageData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
||||||
|
|||||||
@ -1,21 +1,28 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||||
|
|
||||||
|
import static java.util.stream.Collectors.toSet;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
|
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
import lombok.NoArgsConstructor;
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
@EqualsAndHashCode(callSuper = true)
|
||||||
|
@Data
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
@Builder
|
@Builder
|
||||||
@Data
|
|
||||||
@NoArgsConstructor
|
@NoArgsConstructor
|
||||||
public class TextPageBlock extends AbstractPageBlock {
|
public class TextPageBlock extends AbstractPageBlock {
|
||||||
|
|
||||||
@ -67,6 +74,64 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
return sequences.get(0).getPageWidth();
|
return sequences.get(0).getPageWidth();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||||
|
|
||||||
|
List<TextPositionSequence> sequences = textBlocksToMerge.stream().map(TextPageBlock::getSequences).flatMap(java.util.Collection::stream).toList();
|
||||||
|
sequences = new ArrayList<>(sequences);
|
||||||
|
return fromTextPositionSequences(sequences);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
|
||||||
|
|
||||||
|
TextPageBlock textBlock = null;
|
||||||
|
|
||||||
|
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||||
|
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||||
|
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||||
|
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||||
|
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||||
|
|
||||||
|
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||||
|
|
||||||
|
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||||
|
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||||
|
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||||
|
fontFrequencyCounter.add(wordBlock.getFont());
|
||||||
|
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||||
|
|
||||||
|
if (textBlock == null) {
|
||||||
|
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||||
|
wordBlock.getMaxXDirAdj(),
|
||||||
|
wordBlock.getMinYDirAdj(),
|
||||||
|
wordBlock.getMaxYDirAdj(),
|
||||||
|
wordBlockList,
|
||||||
|
wordBlock.getRotation());
|
||||||
|
} else {
|
||||||
|
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||||
|
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (textBlock != null) {
|
||||||
|
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||||
|
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences()
|
||||||
|
.stream()
|
||||||
|
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
|
||||||
|
.collect(toSet())
|
||||||
|
.size() == 1) {
|
||||||
|
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||||
|
}
|
||||||
|
return textBlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the minX value in pdf coordinate system.
|
* Returns the minX value in pdf coordinate system.
|
||||||
|
|||||||
@ -8,8 +8,8 @@ import java.util.stream.Collectors;
|
|||||||
|
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
|
|
||||||
|
import com.dslplatform.json.JsonAttribute;
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||||
|
|
||||||
@ -25,7 +25,6 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@Builder
|
@Builder
|
||||||
@NoArgsConstructor
|
@NoArgsConstructor
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
@JsonIgnoreProperties({"empty"})
|
|
||||||
public class TextPositionSequence implements CharSequence {
|
public class TextPositionSequence implements CharSequence {
|
||||||
|
|
||||||
public static final int HEIGHT_PADDING = 2;
|
public static final int HEIGHT_PADDING = 2;
|
||||||
@ -36,6 +35,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
private int rotation;
|
private int rotation;
|
||||||
private float pageHeight;
|
private float pageHeight;
|
||||||
private float pageWidth;
|
private float pageWidth;
|
||||||
|
private boolean isParagraphStart;
|
||||||
|
|
||||||
|
|
||||||
public TextPositionSequence(int page) {
|
public TextPositionSequence(int page) {
|
||||||
@ -44,7 +44,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public TextPositionSequence(List<TextPosition> textPositions, int page) {
|
public TextPositionSequence(List<TextPosition> textPositions, int page, boolean isParagraphStart) {
|
||||||
|
|
||||||
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
|
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
|
||||||
this.page = page;
|
this.page = page;
|
||||||
@ -52,6 +52,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
this.rotation = textPositions.get(0).getRotation();
|
this.rotation = textPositions.get(0).getRotation();
|
||||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||||
|
this.isParagraphStart = isParagraphStart;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -141,6 +142,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
* @return the text direction adjusted minX value
|
* @return the text direction adjusted minX value
|
||||||
*/
|
*/
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
|
@JsonAttribute(ignore = true)
|
||||||
public float getMinXDirAdj() {
|
public float getMinXDirAdj() {
|
||||||
|
|
||||||
return textPositions.get(0).getXDirAdj();
|
return textPositions.get(0).getXDirAdj();
|
||||||
@ -155,6 +157,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
* @return the text direction adjusted maxX value
|
* @return the text direction adjusted maxX value
|
||||||
*/
|
*/
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
|
@JsonAttribute(ignore = true)
|
||||||
public float getMaxXDirAdj() {
|
public float getMaxXDirAdj() {
|
||||||
|
|
||||||
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
|
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
|
||||||
@ -169,6 +172,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
* @return the text direction adjusted minY value. The upper border of the bounding box of the word.
|
* @return the text direction adjusted minY value. The upper border of the bounding box of the word.
|
||||||
*/
|
*/
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
|
@JsonAttribute(ignore = true)
|
||||||
public float getMinYDirAdj() {
|
public float getMinYDirAdj() {
|
||||||
|
|
||||||
return textPositions.get(0).getYDirAdj() - getTextHeight();
|
return textPositions.get(0).getYDirAdj() - getTextHeight();
|
||||||
@ -183,6 +187,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
* @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
|
* @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
|
||||||
*/
|
*/
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
|
@JsonAttribute(ignore = true)
|
||||||
public float getMaxYDirAdj() {
|
public float getMaxYDirAdj() {
|
||||||
|
|
||||||
return textPositions.get(0).getYDirAdj();
|
return textPositions.get(0).getYDirAdj();
|
||||||
@ -191,6 +196,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
|
@JsonAttribute(ignore = true)
|
||||||
public float getTextHeight() {
|
public float getTextHeight() {
|
||||||
|
|
||||||
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
|
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
|
||||||
@ -198,6 +204,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
|
@JsonAttribute(ignore = true)
|
||||||
public float getHeight() {
|
public float getHeight() {
|
||||||
|
|
||||||
return getMaxYDirAdj() - getMinYDirAdj();
|
return getMaxYDirAdj() - getMinYDirAdj();
|
||||||
@ -205,6 +212,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
|
@JsonAttribute(ignore = true)
|
||||||
public float getWidth() {
|
public float getWidth() {
|
||||||
|
|
||||||
return getMaxXDirAdj() - getMinXDirAdj();
|
return getMaxXDirAdj() - getMinXDirAdj();
|
||||||
@ -212,6 +220,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
|
@JsonAttribute(ignore = true)
|
||||||
public String getFont() {
|
public String getFont() {
|
||||||
|
|
||||||
return textPositions.get(0).getFontName().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", "");
|
return textPositions.get(0).getFontName().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", "");
|
||||||
@ -219,6 +228,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
|
@JsonAttribute(ignore = true)
|
||||||
public String getFontStyle() {
|
public String getFontStyle() {
|
||||||
|
|
||||||
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
|
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
|
||||||
@ -237,6 +247,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
|
@JsonAttribute(ignore = true)
|
||||||
public float getFontSize() {
|
public float getFontSize() {
|
||||||
|
|
||||||
return textPositions.get(0).getFontSizeInPt();
|
return textPositions.get(0).getFontSizeInPt();
|
||||||
@ -244,6 +255,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
|
@JsonAttribute(ignore = true)
|
||||||
public float getSpaceWidth() {
|
public float getSpaceWidth() {
|
||||||
|
|
||||||
return textPositions.get(0).getWidthOfSpace();
|
return textPositions.get(0).getWidthOfSpace();
|
||||||
@ -260,6 +272,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
* @return bounding box of the word in Pdf Coordinate System
|
* @return bounding box of the word in Pdf Coordinate System
|
||||||
*/
|
*/
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
|
@JsonAttribute(ignore = true)
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public Rectangle getRectangle() {
|
public Rectangle getRectangle() {
|
||||||
|
|
||||||
@ -299,3 +312,4 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -76,7 +76,7 @@ import org.apache.pdfbox.util.Vector;
|
|||||||
* THIS CODE IS DELIBERATELY INCORRECT, USE PDFStreamEngine INSTEAD.
|
* THIS CODE IS DELIBERATELY INCORRECT, USE PDFStreamEngine INSTEAD.
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings({"PMD", "checkstyle:all"})
|
@SuppressWarnings({"PMD", "checkstyle:all"})
|
||||||
class LegacyPDFStreamEngine extends PDFStreamEngine {
|
public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||||
|
|
||||||
private static final Log LOG = LogFactory.getLog(LegacyPDFStreamEngine.class);
|
private static final Log LOG = LogFactory.getLog(LegacyPDFStreamEngine.class);
|
||||||
|
|
||||||
@ -126,7 +126,7 @@ class LegacyPDFStreamEngine extends PDFStreamEngine {
|
|||||||
* This will initialize and process the contents of the stream.
|
* This will initialize and process the contents of the stream.
|
||||||
*
|
*
|
||||||
* @param page the page to process
|
* @param page the page to process
|
||||||
* @throws IOException if there is an error accessing the stream.
|
* @throws java.io.IOException if there is an error accessing the stream.
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public void processPage(PDPage page) throws IOException {
|
public void processPage(PDPage page) throws IOException {
|
||||||
@ -149,7 +149,7 @@ class LegacyPDFStreamEngine extends PDFStreamEngine {
|
|||||||
* written by Ben Litchfield for PDFStreamEngine.
|
* written by Ben Litchfield for PDFStreamEngine.
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code,Vector displacement) throws IOException {
|
protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, Vector displacement) throws IOException {
|
||||||
//
|
//
|
||||||
// legacy calculations which were previously in PDFStreamEngine
|
// legacy calculations which were previously in PDFStreamEngine
|
||||||
//
|
//
|
||||||
@ -165,7 +165,7 @@ class LegacyPDFStreamEngine extends PDFStreamEngine {
|
|||||||
|
|
||||||
float displacementX = displacement.getX();
|
float displacementX = displacement.getX();
|
||||||
// the sorting algorithm is based on the width of the character. As the displacement
|
// the sorting algorithm is based on the width of the character. As the displacement
|
||||||
// for vertical characters doesn't provide any suitable value for it, we have to
|
// for vertical characters doesn't provide any suitable value for it, we have to
|
||||||
// calculate our own
|
// calculate our own
|
||||||
if (font.isVertical()) {
|
if (font.isVertical()) {
|
||||||
displacementX = font.getWidth(code) / 1000;
|
displacementX = font.getWidth(code) / 1000;
|
||||||
@ -382,3 +382,4 @@ class LegacyPDFStreamEngine extends PDFStreamEngine {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.classification.pars
|
|||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||||
@ -208,13 +207,11 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
public void writeString(String text, List<TextPosition> textPositions, boolean isParagraphStart) throws IOException {
|
||||||
|
|
||||||
int startIndex = 0;
|
int startIndex = 0;
|
||||||
RedTextPosition previous = null;
|
RedTextPosition previous = null;
|
||||||
|
|
||||||
textPositions.sort(Comparator.comparing(TextPosition::getXDirAdj));
|
|
||||||
|
|
||||||
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
||||||
|
|
||||||
if (!textPositionSequences.isEmpty()) {
|
if (!textPositionSequences.isEmpty()) {
|
||||||
@ -250,7 +247,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||||
.getUnicode()
|
.getUnicode()
|
||||||
.equals("\t")))) {
|
.equals("\t")))) {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() -1 && isParagraphStart));
|
||||||
}
|
}
|
||||||
startIndex = i;
|
startIndex = i;
|
||||||
}
|
}
|
||||||
@ -260,7 +257,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||||
.getUnicode()
|
.getUnicode()
|
||||||
.equals("\t")))) {
|
.equals("\t")))) {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() -1 && isParagraphStart));
|
||||||
}
|
}
|
||||||
startIndex = i;
|
startIndex = i;
|
||||||
}
|
}
|
||||||
@ -276,11 +273,11 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
// Remove false sequence ends (whitespaces)
|
// Remove false sequence ends (whitespaces)
|
||||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
||||||
for (TextPosition textPosition : sublist) {
|
for (TextPosition t : sublist) {
|
||||||
textPositionSequences.get(textPositionSequences.size() - 1).add(textPosition);
|
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() -1 && isParagraphStart));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
startIndex = i + 1;
|
startIndex = i + 1;
|
||||||
@ -303,7 +300,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
super.writeString(text);
|
super.writeString(text);
|
||||||
@ -328,3 +325,4 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -27,6 +27,7 @@ import java.text.Bidi;
|
|||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
@ -240,10 +241,10 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
document = doc;
|
document = doc;
|
||||||
output = outputStream;
|
output = outputStream;
|
||||||
if (getAddMoreFormatting()) {
|
if (getAddMoreFormatting()) {
|
||||||
paragraphEnd = lineSeparator;
|
paragraphEnd = "\n----ParagraphEnd----\n\n";
|
||||||
pageStart = lineSeparator;
|
pageStart = lineSeparator;
|
||||||
articleStart = lineSeparator;
|
articleStart = "\n----ArticelStart----\n\n";
|
||||||
articleEnd = lineSeparator;
|
articleEnd = "\n----ArticelEnd----\n\n";
|
||||||
}
|
}
|
||||||
startDocument(document);
|
startDocument(document);
|
||||||
processPages(document.getPages());
|
processPages(document.getPages());
|
||||||
@ -594,9 +595,14 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
// but this caused a lot of regression test failures. So, I'm leaving it be for
|
// but this caused a lot of regression test failures. So, I'm leaving it be for
|
||||||
// now
|
// now
|
||||||
if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) {
|
if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) {
|
||||||
writeLine(normalize(line));
|
var normalized = normalize(line);
|
||||||
line.clear();
|
// normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent()
|
||||||
|
|
||||||
|
|
||||||
lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
|
lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
|
||||||
|
writeLine(normalized, current.isParagraphStart);
|
||||||
|
line.clear();
|
||||||
|
|
||||||
expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
|
expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
|
||||||
maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
|
maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
|
||||||
maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
|
maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
|
||||||
@ -630,7 +636,24 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
if (startOfPage && lastPosition == null) {
|
if (startOfPage && lastPosition == null) {
|
||||||
writeParagraphStart();// not sure this is correct for RTL?
|
writeParagraphStart();// not sure this is correct for RTL?
|
||||||
}
|
}
|
||||||
|
|
||||||
line.add(new LineItem(position));
|
line.add(new LineItem(position));
|
||||||
|
|
||||||
|
// Collections.sort(line, new Comparator<LineItem>() {
|
||||||
|
//
|
||||||
|
// @Override
|
||||||
|
// public int compare(LineItem str1, LineItem str2) {
|
||||||
|
// if(null == str1.getTextPosition()) {
|
||||||
|
// return 0;
|
||||||
|
// }
|
||||||
|
// else if(null == str2.getTextPosition()) {
|
||||||
|
// return 0;
|
||||||
|
// }
|
||||||
|
// return Float.compare(str1.getTextPosition().getX(), str2.getTextPosition().getX());
|
||||||
|
// }
|
||||||
|
// });
|
||||||
|
|
||||||
|
// line.sort(Comparator.comparing(a -> a.getTextPosition() != null && a.getTextPosition().getX()));
|
||||||
}
|
}
|
||||||
maxHeightForLine = Math.max(maxHeightForLine, positionHeight);
|
maxHeightForLine = Math.max(maxHeightForLine, positionHeight);
|
||||||
minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
|
minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
|
||||||
@ -646,7 +669,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
}
|
}
|
||||||
// print the final line
|
// print the final line
|
||||||
if (line.size() > 0) {
|
if (line.size() > 0) {
|
||||||
writeLine(normalize(line));
|
writeLine(normalize(line), false);
|
||||||
writeParagraphEnd();
|
writeParagraphEnd();
|
||||||
}
|
}
|
||||||
endArticle();
|
endArticle();
|
||||||
@ -703,7 +726,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
* @param textPositions The TextPositions belonging to the text.
|
* @param textPositions The TextPositions belonging to the text.
|
||||||
* @throws IOException If there is an error when writing the text.
|
* @throws IOException If there is an error when writing the text.
|
||||||
*/
|
*/
|
||||||
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
protected void writeString(String text, List<TextPosition> textPositions, boolean isParagraphEnd) throws IOException {
|
||||||
|
|
||||||
writeString(text);
|
writeString(text);
|
||||||
}
|
}
|
||||||
@ -998,7 +1021,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* By default the text stripper will attempt to remove text that overlapps each other. Word paints the same
|
* By default, the text stripper will attempt to remove text that overlapps each other. Word paints the same
|
||||||
* character several times in order to make it look bold. By setting this to false all text will be extracted, which
|
* character several times in order to make it look bold. By setting this to false all text will be extracted, which
|
||||||
* means that certain sections will be duplicated, but better performance will be noticed.
|
* means that certain sections will be duplicated, but better performance will be noticed.
|
||||||
*
|
*
|
||||||
@ -1385,6 +1408,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
} else {
|
} else {
|
||||||
writeLineSeparator();
|
writeLineSeparator();
|
||||||
writeParagraphSeparator();
|
writeParagraphSeparator();
|
||||||
|
lastLineStartPosition.setEndParagraphWritten();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
writeLineSeparator();
|
writeLineSeparator();
|
||||||
@ -1428,6 +1452,10 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
float newXVal = multiplyFloat(getIndentThreshold(), position.getTextPosition().getWidthOfSpace());
|
float newXVal = multiplyFloat(getIndentThreshold(), position.getTextPosition().getWidthOfSpace());
|
||||||
float positionWidth = multiplyFloat(0.25f, position.getTextPosition().getWidth());
|
float positionWidth = multiplyFloat(0.25f, position.getTextPosition().getWidth());
|
||||||
|
|
||||||
|
// if(xGap < 0){
|
||||||
|
// result = true;
|
||||||
|
// }
|
||||||
|
// else
|
||||||
if (yGap > newYVal) {
|
if (yGap > newYVal) {
|
||||||
result = true;
|
result = true;
|
||||||
} else if (xGap > newXVal) {
|
} else if (xGap > newXVal) {
|
||||||
@ -1636,12 +1664,13 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
* @param line a list with the words of the given line
|
* @param line a list with the words of the given line
|
||||||
* @throws IOException if something went wrong
|
* @throws IOException if something went wrong
|
||||||
*/
|
*/
|
||||||
private void writeLine(List<WordWithTextPositions> line) throws IOException {
|
private void writeLine(List<WordWithTextPositions> line, boolean isParagraphEnd) throws IOException {
|
||||||
|
|
||||||
int numberOfStrings = line.size();
|
int numberOfStrings = line.size();
|
||||||
for (int i = 0; i < numberOfStrings; i++) {
|
for (int i = 0; i < numberOfStrings; i++) {
|
||||||
WordWithTextPositions word = line.get(i);
|
WordWithTextPositions word = line.get(i);
|
||||||
writeString(word.getText(), word.getTextPositions());
|
word.getTextPositions().sort(Comparator.comparing(TextPosition::getX));
|
||||||
|
writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == numberOfStrings - 1);
|
||||||
if (i < numberOfStrings - 1) {
|
if (i < numberOfStrings - 1) {
|
||||||
writeWordSeparator();
|
writeWordSeparator();
|
||||||
}
|
}
|
||||||
@ -1963,6 +1992,8 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
private boolean isHangingIndent = false;
|
private boolean isHangingIndent = false;
|
||||||
private boolean isArticleStart = false;
|
private boolean isArticleStart = false;
|
||||||
|
|
||||||
|
private boolean endParagraphWritten = false;
|
||||||
|
|
||||||
private TextPosition position = null;
|
private TextPosition position = null;
|
||||||
|
|
||||||
|
|
||||||
@ -2024,6 +2055,16 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isEndParagraphWritten() {
|
||||||
|
|
||||||
|
return endParagraphWritten;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEndParagraphWritten(){
|
||||||
|
endParagraphWritten = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets the isArticleStart() flag to true.
|
* Sets the isArticleStart() flag to true.
|
||||||
*/
|
*/
|
||||||
@ -2065,3 +2106,4 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,20 +1,18 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||||
|
|
||||||
import static java.util.stream.Collectors.toSet;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.Orientation;
|
import com.knecon.fforesight.service.layoutparser.processor.classification.model.Orientation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.RulingTextDirAdjustUtil;
|
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.RulingTextDirAdjustUtil;
|
||||||
@ -23,95 +21,64 @@ import com.knecon.fforesight.service.layoutparser.processor.classification.utils
|
|||||||
@SuppressWarnings("all")
|
@SuppressWarnings("all")
|
||||||
public class BlockificationService {
|
public class BlockificationService {
|
||||||
|
|
||||||
static final float THRESHOLD = 1f;
|
private static final float THRESHOLD = 1f;
|
||||||
|
private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f;
|
||||||
|
private static final int X_GAP_SPLIT_CONSTANT = 50;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
||||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||||
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||||
*
|
*
|
||||||
* @param textPositions The words of a page.
|
* @param textPositions The words of a page.
|
||||||
* @param horizontalRulingLines Horizontal table lines.
|
* @param horizontalRulingLines Horizontal table lines.
|
||||||
* @param verticalRulingLines Vertical table lines.
|
* @param verticalRulingLines Vertical table lines.
|
||||||
* @return Page object that contains the Textblock and text statistics.
|
* @return ClassificationPage object that contains the Textblock and text statistics.
|
||||||
*/
|
*/
|
||||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||||
|
|
||||||
int indexOnPage = 0;
|
List<TextPageBlock> classificationTextBlocks = constructFineGranularTextPageBlocks(textPositions, horizontalRulingLines, verticalRulingLines);
|
||||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
|
||||||
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
|
||||||
|
|
||||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
classificationTextBlocks = mergeFineGranularTextPageBlocks(classificationTextBlocks);
|
||||||
TextPositionSequence prev = null;
|
|
||||||
|
|
||||||
boolean wasSplitted = false;
|
return new ClassificationPage(new ArrayList<>(classificationTextBlocks.stream().map(classificationTextBlock -> (AbstractPageBlock) classificationTextBlock).toList()));
|
||||||
Float splitX1 = null;
|
}
|
||||||
for (TextPositionSequence word : textPositions) {
|
|
||||||
|
|
||||||
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
|
|
||||||
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
|
|
||||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
|
||||||
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
|
||||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
|
||||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
|
||||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
|
||||||
|
|
||||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
private List<TextPageBlock> mergeFineGranularTextPageBlocks(List<TextPageBlock> classificationTextBlocks) {
|
||||||
|
|
||||||
Orientation prevOrientation = null;
|
if (classificationTextBlocks.isEmpty()) {
|
||||||
if (!chunkBlockList.isEmpty()) {
|
return new ArrayList<>();
|
||||||
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
|
|
||||||
}
|
|
||||||
|
|
||||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
|
||||||
indexOnPage++;
|
|
||||||
|
|
||||||
chunkBlockList.add(cb1);
|
|
||||||
chunkWords = new ArrayList<>();
|
|
||||||
|
|
||||||
if (splitByX && !isSplitByRuling) {
|
|
||||||
wasSplitted = true;
|
|
||||||
cb1.setOrientation(Orientation.LEFT);
|
|
||||||
splitX1 = word.getMinXDirAdj();
|
|
||||||
} else if (newLineAfterSplit && !isSplitByRuling) {
|
|
||||||
wasSplitted = false;
|
|
||||||
cb1.setOrientation(Orientation.RIGHT);
|
|
||||||
splitX1 = null;
|
|
||||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
|
||||||
cb1.setOrientation(Orientation.LEFT);
|
|
||||||
}
|
|
||||||
|
|
||||||
minX = 1000;
|
|
||||||
maxX = 0;
|
|
||||||
minY = 1000;
|
|
||||||
maxY = 0;
|
|
||||||
prev = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
chunkWords.add(word);
|
|
||||||
|
|
||||||
prev = word;
|
|
||||||
if (word.getMinXDirAdj() < minX) {
|
|
||||||
minX = word.getMinXDirAdj();
|
|
||||||
}
|
|
||||||
if (word.getMaxXDirAdj() > maxX) {
|
|
||||||
maxX = word.getMaxXDirAdj();
|
|
||||||
}
|
|
||||||
if (word.getMinYDirAdj() < minY) {
|
|
||||||
minY = word.getMinYDirAdj();
|
|
||||||
}
|
|
||||||
if (word.getMaxYDirAdj() > maxY) {
|
|
||||||
maxY = word.getMaxYDirAdj();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
List<List<TextPageBlock>> textBlocksToMerge = new LinkedList<>();
|
||||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
List<TextPageBlock> currentTextBlocksToMerge = new LinkedList<>();
|
||||||
if (cb1 != null) {
|
textBlocksToMerge.add(currentTextBlocksToMerge);
|
||||||
chunkBlockList.add(cb1);
|
TextPageBlock previousTextBlock = null;
|
||||||
|
for (TextPageBlock currentTextBlock : classificationTextBlocks) {
|
||||||
|
if (previousTextBlock == null) {
|
||||||
|
currentTextBlocksToMerge.add(currentTextBlock);
|
||||||
|
previousTextBlock = currentTextBlock;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < 1;
|
||||||
|
boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < 5;
|
||||||
|
if (alignsXRight && smallYGap) {
|
||||||
|
currentTextBlocksToMerge.add(currentTextBlock);
|
||||||
|
} else {
|
||||||
|
currentTextBlocksToMerge = new LinkedList<>();
|
||||||
|
currentTextBlocksToMerge.add(currentTextBlock);
|
||||||
|
textBlocksToMerge.add(currentTextBlocksToMerge);
|
||||||
|
}
|
||||||
|
previousTextBlock = currentTextBlock;
|
||||||
}
|
}
|
||||||
|
return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
|
||||||
|
}
|
||||||
|
|
||||||
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
|
|
||||||
|
private void assignOrientations(List<TextPageBlock> classificationTextBlocks) {
|
||||||
|
|
||||||
|
Iterator<TextPageBlock> itty = classificationTextBlocks.iterator();
|
||||||
|
|
||||||
TextPageBlock previousLeft = null;
|
TextPageBlock previousLeft = null;
|
||||||
TextPageBlock previousRight = null;
|
TextPageBlock previousRight = null;
|
||||||
@ -141,12 +108,13 @@ public class BlockificationService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
itty = chunkBlockList.iterator();
|
itty = classificationTextBlocks.iterator();
|
||||||
TextPageBlock previous = null;
|
TextPageBlock previous = null;
|
||||||
while (itty.hasNext()) {
|
while (itty.hasNext()) {
|
||||||
TextPageBlock block = (TextPageBlock) itty.next();
|
TextPageBlock block = (TextPageBlock) itty.next();
|
||||||
|
|
||||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(
|
||||||
|
block.getMaxY(),
|
||||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
||||||
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||||
previous.add(block);
|
previous.add(block);
|
||||||
@ -156,8 +124,95 @@ public class BlockificationService {
|
|||||||
|
|
||||||
previous = block;
|
previous = block;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return new ClassificationPage(chunkBlockList);
|
|
||||||
|
private List<TextPageBlock> constructFineGranularTextPageBlocks(List<TextPositionSequence> textPositions,
|
||||||
|
List<Ruling> horizontalRulingLines,
|
||||||
|
List<Ruling> verticalRulingLines) {
|
||||||
|
|
||||||
|
int indexOnPage = 0;
|
||||||
|
List<TextPositionSequence> wordClusterToCombine = new ArrayList<>();
|
||||||
|
List<TextPageBlock> classificationTextBlocks = new ArrayList<>();
|
||||||
|
|
||||||
|
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||||
|
TextPositionSequence prev = null;
|
||||||
|
|
||||||
|
var listIdentitifier = Pattern.compile("\\b(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)])", Pattern.CASE_INSENSITIVE);
|
||||||
|
|
||||||
|
boolean wasSplitted = false;
|
||||||
|
Float splitX1 = null;
|
||||||
|
for (TextPositionSequence word : textPositions) {
|
||||||
|
|
||||||
|
Matcher listIdentifierPattern = listIdentitifier.matcher(word.toString());
|
||||||
|
|
||||||
|
boolean yGap = word.getMinYDirAdj() - maxY > word.getHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||||
|
boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj());
|
||||||
|
boolean positiveXGapInline = prev != null && maxX + X_GAP_SPLIT_CONSTANT < word.getMinXDirAdj() && sameLine;
|
||||||
|
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
|
||||||
|
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
|
||||||
|
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||||
|
boolean splitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||||
|
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||||
|
boolean fontChange = prev != null && (!word.getFont().equals(prev.getFont()) || !word.getFontStyle()
|
||||||
|
.equals(prev.getFontStyle()) || word.getFontSize() != prev.getFontSize());
|
||||||
|
boolean newline = prev != null && Math.abs(word.getMinYDirAdj() - prev.getMinYDirAdj()) > word.getHeight();
|
||||||
|
boolean isListIdentifier = listIdentifierPattern.matches();
|
||||||
|
|
||||||
|
if (prev != null && (prev.isParagraphStart() || negativeXGap || positiveXGapInline || yGap || startFromTop || splitByRuling || (newline && (fontChange || isListIdentifier)))) {
|
||||||
|
// if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||||
|
|
||||||
|
Orientation prevOrientation = null;
|
||||||
|
if (!classificationTextBlocks.isEmpty()) {
|
||||||
|
prevOrientation = classificationTextBlocks.get(classificationTextBlocks.size() - 1).getOrientation();
|
||||||
|
}
|
||||||
|
|
||||||
|
TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine);
|
||||||
|
|
||||||
|
classificationTextBlocks.add(classificationTextBlock);
|
||||||
|
wordClusterToCombine = new ArrayList<>();
|
||||||
|
|
||||||
|
if (positiveXGapInline && !splitByRuling) {
|
||||||
|
wasSplitted = true;
|
||||||
|
classificationTextBlock.setOrientation(Orientation.LEFT);
|
||||||
|
splitX1 = word.getMinXDirAdj();
|
||||||
|
} else if (newLineAfterSplit && !splitByRuling) {
|
||||||
|
wasSplitted = false;
|
||||||
|
classificationTextBlock.setOrientation(Orientation.RIGHT);
|
||||||
|
splitX1 = null;
|
||||||
|
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (yGap || !startFromTop || !positiveXGapInline || !newLineAfterSplit || !splitByRuling)) {
|
||||||
|
classificationTextBlock.setOrientation(Orientation.LEFT);
|
||||||
|
}
|
||||||
|
|
||||||
|
minX = 1000;
|
||||||
|
maxX = 0;
|
||||||
|
minY = 1000;
|
||||||
|
maxY = 0;
|
||||||
|
prev = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
wordClusterToCombine.add(word);
|
||||||
|
|
||||||
|
prev = word;
|
||||||
|
if (word.getMinXDirAdj() < minX) {
|
||||||
|
minX = word.getMinXDirAdj();
|
||||||
|
}
|
||||||
|
if (word.getMaxXDirAdj() > maxX) {
|
||||||
|
maxX = word.getMaxXDirAdj();
|
||||||
|
}
|
||||||
|
if (word.getMinYDirAdj() < minY) {
|
||||||
|
minY = word.getMinYDirAdj();
|
||||||
|
}
|
||||||
|
if (word.getMaxYDirAdj() > maxY) {
|
||||||
|
maxY = word.getMaxYDirAdj();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine);
|
||||||
|
if (classificationTextBlock != null) {
|
||||||
|
classificationTextBlocks.add(classificationTextBlock);
|
||||||
|
}
|
||||||
|
return classificationTextBlocks;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -167,53 +222,6 @@ public class BlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
|
||||||
|
|
||||||
TextPageBlock textBlock = null;
|
|
||||||
|
|
||||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
|
||||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
|
||||||
|
|
||||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
|
||||||
|
|
||||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
|
||||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
|
||||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
|
||||||
fontFrequencyCounter.add(wordBlock.getFont());
|
|
||||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
|
||||||
|
|
||||||
if (textBlock == null) {
|
|
||||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
|
||||||
wordBlock.getMaxXDirAdj(),
|
|
||||||
wordBlock.getMinYDirAdj(),
|
|
||||||
wordBlock.getMaxYDirAdj(),
|
|
||||||
wordBlockList,
|
|
||||||
wordBlock.getRotation());
|
|
||||||
} else {
|
|
||||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
|
||||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (textBlock != null) {
|
|
||||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
|
||||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
|
||||||
}
|
|
||||||
return textBlock;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean isSplitByRuling(float minX,
|
private boolean isSplitByRuling(float minX,
|
||||||
float minY,
|
float minY,
|
||||||
float maxX,
|
float maxX,
|
||||||
@ -253,7 +261,7 @@ public class BlockificationService {
|
|||||||
verticalRulingLines,
|
verticalRulingLines,
|
||||||
word.getDir().getDegrees(),
|
word.getDir().getDegrees(),
|
||||||
word.getPageWidth(),
|
word.getPageWidth(),
|
||||||
word.getPageHeight());
|
word.getPageHeight()); //
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -268,11 +276,5 @@ public class BlockificationService {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private double round(float value, int decimalPoints) {
|
|
||||||
|
|
||||||
var d = Math.pow(10, decimalPoints);
|
|
||||||
return Math.round(value * d) / d;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -17,7 +17,7 @@ import com.knecon.fforesight.service.layoutparser.processor.classification.utils
|
|||||||
@Service
|
@Service
|
||||||
public class BodyTextFrameService {
|
public class BodyTextFrameService {
|
||||||
|
|
||||||
private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.9f;
|
private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.0f;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -57,12 +57,9 @@ public class ClassificationService {
|
|||||||
textBlock.setClassification(PageBlockType.OTHER);
|
textBlock.setClassification(PageBlockType.OTHER);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
|
||||||
textBlock.setClassification(PageBlockType.HEADER);
|
textBlock.setClassification(PageBlockType.HEADER);
|
||||||
|
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
|
||||||
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
|
||||||
textBlock.setClassification(PageBlockType.FOOTER);
|
textBlock.setClassification(PageBlockType.FOOTER);
|
||||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||||
|
|||||||
@ -136,6 +136,14 @@ public class TableExtractionService {
|
|||||||
|
|
||||||
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||||
|
|
||||||
|
for (Ruling r : horizontalRulingLines) {
|
||||||
|
if (r.getX2() < r.getX1()) {
|
||||||
|
double a = r.getX2();
|
||||||
|
r.x2 = (float) r.getX1();
|
||||||
|
r.x1 = (float) a;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
List<Cell> cellsFound = new ArrayList<>();
|
List<Cell> cellsFound = new ArrayList<>();
|
||||||
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
|
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
|
||||||
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
|
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
|
||||||
|
|||||||
@ -1,56 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
|
|
||||||
@Slf4j
|
|
||||||
@UtilityClass
|
|
||||||
public class FileUtils {
|
|
||||||
|
|
||||||
public File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException {
|
|
||||||
|
|
||||||
File tempFile = Files.createTempFile(filenamePrefix, filenameSuffix).toFile();
|
|
||||||
setRWPermissionsOnlyForOwner(tempFile);
|
|
||||||
|
|
||||||
return tempFile;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Deletes a file; logs a message with the reason if the deletion fails.
|
|
||||||
* This method is null-safe.
|
|
||||||
*
|
|
||||||
* @param file The file to delete. Can be null.
|
|
||||||
*/
|
|
||||||
public void deleteFile(File file) {
|
|
||||||
|
|
||||||
if (file != null) {
|
|
||||||
try {
|
|
||||||
Files.deleteIfExists(file.toPath());
|
|
||||||
} catch (IOException ex) {
|
|
||||||
log.warn("Could not delete file!", ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// We don't need to check the results of the permission setters below,
|
|
||||||
// since we're manipulating a file we created ourselves.
|
|
||||||
@SuppressWarnings({"ResultOfMethodCallIgnored", "squid:S899"})
|
|
||||||
private void setRWPermissionsOnlyForOwner(File tempFile) {
|
|
||||||
|
|
||||||
try {
|
|
||||||
tempFile.setReadable(true, true);
|
|
||||||
tempFile.setWritable(true, true);
|
|
||||||
tempFile.setExecutable(false);
|
|
||||||
} catch (SecurityException ex) {
|
|
||||||
// This should never happen since we're creating a temp file ourselves.
|
|
||||||
log.warn("Caught an exception during temp file creation. This should not happend. Check the code.", ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -4,6 +4,8 @@ import java.awt.geom.Rectangle2D;
|
|||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
@ -16,7 +18,9 @@ public class SearchTextWithTextPositionDto {
|
|||||||
|
|
||||||
String searchText;
|
String searchText;
|
||||||
List<Integer> lineBreaks;
|
List<Integer> lineBreaks;
|
||||||
List<Integer> stringCoordsToPositionCoords;
|
List<Integer> stringIdxToPositionIdx;
|
||||||
|
List<Boundary> boldTextBoundaries;
|
||||||
|
List<Boundary> italicTextBoundaries;
|
||||||
List<Rectangle2D> positions;
|
List<Rectangle2D> positions;
|
||||||
|
|
||||||
|
|
||||||
@ -26,7 +30,7 @@ public class SearchTextWithTextPositionDto {
|
|||||||
.searchText("")
|
.searchText("")
|
||||||
.lineBreaks(Collections.emptyList())
|
.lineBreaks(Collections.emptyList())
|
||||||
.positions(Collections.emptyList())
|
.positions(Collections.emptyList())
|
||||||
.stringCoordsToPositionCoords(Collections.emptyList())
|
.stringIdxToPositionIdx(Collections.emptyList())
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.factory;
|
|||||||
|
|
||||||
import java.awt.geom.AffineTransform;
|
import java.awt.geom.AffineTransform;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
@ -9,6 +10,7 @@ import java.util.Objects;
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition;
|
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextDirection;
|
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextDirection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
@ -24,7 +26,7 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3;
|
public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3;
|
||||||
|
|
||||||
|
|
||||||
public SearchTextWithTextPositionDto buildSearchTextToTextPositionModel(List<TextPositionSequence> sequences) {
|
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<TextPositionSequence> sequences) {
|
||||||
|
|
||||||
if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
|
if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
|
||||||
return SearchTextWithTextPositionDto.empty();
|
return SearchTextWithTextPositionDto.empty();
|
||||||
@ -69,8 +71,10 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
return SearchTextWithTextPositionDto.builder()
|
return SearchTextWithTextPositionDto.builder()
|
||||||
.searchText(context.stringBuilder.toString())
|
.searchText(context.stringBuilder.toString())
|
||||||
.lineBreaks(context.lineBreaksStringIdx)
|
.lineBreaks(context.lineBreaksStringIdx)
|
||||||
.stringCoordsToPositionCoords(context.stringIdxToPositionIdx)
|
.stringIdxToPositionIdx(context.stringIdxToPositionIdx)
|
||||||
.positions(positions)
|
.positions(positions)
|
||||||
|
.boldTextBoundaries(mergeToBoundaries(context.boldTextsStringIdx))
|
||||||
|
.italicTextBoundaries(mergeToBoundaries(context.italicTextStringIdx))
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -82,6 +86,8 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
// unicode characters with more than 16-bit encoding have a length > 1 in java strings
|
// unicode characters with more than 16-bit encoding have a length > 1 in java strings
|
||||||
for (int j = 0; j < currentTextPosition.getUnicode().length(); j++) {
|
for (int j = 0; j < currentTextPosition.getUnicode().length(); j++) {
|
||||||
context.stringIdxToPositionIdx.add(context.positionIdx);
|
context.stringIdxToPositionIdx.add(context.positionIdx);
|
||||||
|
addTextPositionWithFontType(currentTextPosition, "bold", context.boldTextsStringIdx, context.stringIdx);
|
||||||
|
addTextPositionWithFontType(currentTextPosition, "italic", context.italicTextStringIdx, context.stringIdx);
|
||||||
}
|
}
|
||||||
context.stringIdx += currentTextPosition.getUnicode().length();
|
context.stringIdx += currentTextPosition.getUnicode().length();
|
||||||
}
|
}
|
||||||
@ -103,6 +109,33 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE;
|
return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static List<Boundary> mergeToBoundaries(List<Integer> integers) {
|
||||||
|
|
||||||
|
if (integers.isEmpty()) {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
List<Boundary> boundaries = new LinkedList<>();
|
||||||
|
int start = integers.get(0);
|
||||||
|
int end = integers.get(0) + 1;
|
||||||
|
for (int current : integers) {
|
||||||
|
if (current > end + 1) {
|
||||||
|
boundaries.add(new Boundary(start, end));
|
||||||
|
start = current;
|
||||||
|
}
|
||||||
|
end = current + 1;
|
||||||
|
}
|
||||||
|
if (boundaries.isEmpty())
|
||||||
|
boundaries.add(new Boundary(start, end));
|
||||||
|
return boundaries;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void addTextPositionWithFontType(RedTextPosition currentTextPosition, String fontType, List<Integer> fontTypePositions, int stringIdx) {
|
||||||
|
|
||||||
|
if (currentTextPosition.getFontName().toLowerCase().contains(fontType)) {
|
||||||
|
fontTypePositions.add(stringIdx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
|
private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
|
||||||
|
|
||||||
@ -173,6 +206,8 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
|
|
||||||
List<Integer> stringIdxToPositionIdx = new LinkedList<>();
|
List<Integer> stringIdxToPositionIdx = new LinkedList<>();
|
||||||
List<Integer> lineBreaksStringIdx = new LinkedList<>();
|
List<Integer> lineBreaksStringIdx = new LinkedList<>();
|
||||||
|
List<Integer> boldTextsStringIdx = new LinkedList<>();
|
||||||
|
List<Integer> italicTextStringIdx = new LinkedList<>();
|
||||||
StringBuilder stringBuilder = new StringBuilder();
|
StringBuilder stringBuilder = new StringBuilder();
|
||||||
|
|
||||||
int stringIdx;
|
int stringIdx;
|
||||||
|
|||||||
@ -26,12 +26,33 @@ public class TextBlockFactory {
|
|||||||
|
|
||||||
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, Integer numberOnPage, Page page) {
|
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, Integer numberOnPage, Page page) {
|
||||||
|
|
||||||
SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionModel(sequences);
|
SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionDto(sequences);
|
||||||
int offset = stringOffset;
|
int offset = stringOffset;
|
||||||
stringOffset += searchTextWithTextPositionDto.getSearchText().length();
|
stringOffset += searchTextWithTextPositionDto.getSearchText().length();
|
||||||
long idx = textBlockIdx;
|
long idx = textBlockIdx;
|
||||||
textBlockIdx++;
|
textBlockIdx++;
|
||||||
return AtomicTextBlock.fromSearchTextWithTextPositionDto(searchTextWithTextPositionDto, parent, offset, idx, numberOnPage, page);
|
String orientation;
|
||||||
|
int textDirection;
|
||||||
|
if (sequences.isEmpty()) {
|
||||||
|
orientation = null;
|
||||||
|
textDirection = 0;
|
||||||
|
} else {
|
||||||
|
orientation = sequences.get(0).getDir().toString();
|
||||||
|
textDirection = sequences.get(0).getRotation();
|
||||||
|
}
|
||||||
|
return AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(),
|
||||||
|
searchTextWithTextPositionDto.getLineBreaks(),
|
||||||
|
searchTextWithTextPositionDto.getBoldTextBoundaries(),
|
||||||
|
searchTextWithTextPositionDto.getItalicTextBoundaries(),
|
||||||
|
searchTextWithTextPositionDto.getPositions(),
|
||||||
|
searchTextWithTextPositionDto.getStringIdxToPositionIdx(),
|
||||||
|
idx,
|
||||||
|
parent,
|
||||||
|
numberOnPage,
|
||||||
|
page,
|
||||||
|
offset,
|
||||||
|
orientation,
|
||||||
|
textDirection);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -6,8 +6,6 @@ import java.util.Collection;
|
|||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
|
||||||
|
|
||||||
import lombok.EqualsAndHashCode;
|
import lombok.EqualsAndHashCode;
|
||||||
import lombok.Setter;
|
import lombok.Setter;
|
||||||
|
|
||||||
@ -138,26 +136,4 @@ public class Boundary implements Comparable<Boundary> {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* shrinks the boundary, such that textBlock.subSequence(boundary) returns a string without whitespaces.
|
|
||||||
*
|
|
||||||
* @param textBlock TextBlock to check whitespaces against
|
|
||||||
* @return boundary
|
|
||||||
*/
|
|
||||||
public Boundary trim(TextBlock textBlock) {
|
|
||||||
|
|
||||||
int trimmedStart = this.start;
|
|
||||||
while (Character.isWhitespace(textBlock.charAt(trimmedStart))) {
|
|
||||||
trimmedStart++;
|
|
||||||
}
|
|
||||||
|
|
||||||
int trimmedEnd = this.end;
|
|
||||||
while (Character.isWhitespace(textBlock.charAt(trimmedEnd - 1))) {
|
|
||||||
trimmedEnd--;
|
|
||||||
}
|
|
||||||
|
|
||||||
return new Boundary(trimmedStart, Math.max(trimmedEnd, trimmedStart));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -7,7 +7,7 @@ import java.util.LinkedList;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||||
|
|||||||
@ -11,7 +11,7 @@ import java.util.stream.Collectors;
|
|||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import com.amazonaws.services.kms.model.NotFoundException;
|
import com.amazonaws.services.kms.model.NotFoundException;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import java.util.HashSet;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import java.util.HashSet;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import java.util.HashSet;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||||
|
|||||||
@ -8,7 +8,7 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import java.util.HashSet;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import java.util.HashSet;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||||
|
|||||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.graph.nodes;
|
|||||||
import static java.lang.String.format;
|
import static java.lang.String.format;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.Comparator;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
@ -10,7 +11,7 @@ import java.util.Set;
|
|||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.EntityType;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.EntityType;
|
||||||
@ -59,6 +60,12 @@ public interface SemanticNode {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
default Page getFirstPage() {
|
||||||
|
|
||||||
|
return getTextBlock().getPages().stream().min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Each AtomicTextBlock is assigned a page, so to get the pages for this boundary, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock.
|
* Each AtomicTextBlock is assigned a page, so to get the pages for this boundary, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock.
|
||||||
*
|
*
|
||||||
@ -306,7 +313,6 @@ public interface SemanticNode {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This function is used during insertion of EntityNodes into the graph, it checks if the boundary of the RedactionEntity intersects or even contains the RedactionEntity.
|
* This function is used during insertion of EntityNodes into the graph, it checks if the boundary of the RedactionEntity intersects or even contains the RedactionEntity.
|
||||||
* It sets the fields accordingly and recursively calls this function on all its children.
|
* It sets the fields accordingly and recursively calls this function on all its children.
|
||||||
|
|||||||
@ -9,7 +9,7 @@ import java.util.Set;
|
|||||||
import java.util.stream.IntStream;
|
import java.util.stream.IntStream;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||||
|
|||||||
@ -7,7 +7,7 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||||
|
|||||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.graph.textblock;
|
|||||||
import static java.lang.String.format;
|
import static java.lang.String.format;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
@ -10,9 +11,8 @@ import java.util.HashMap;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.factory.SearchTextWithTextPositionDto;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||||
@ -38,11 +38,20 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
//string coordinates
|
//string coordinates
|
||||||
Boundary boundary;
|
Boundary boundary;
|
||||||
String searchText;
|
String searchText;
|
||||||
List<Integer> lineBreaks;
|
@Builder.Default
|
||||||
|
List<Integer> lineBreaks = new ArrayList<>();
|
||||||
|
@Builder.Default
|
||||||
|
List<Boundary> boldTextBoundaries = new ArrayList<>();
|
||||||
|
@Builder.Default
|
||||||
|
List<Boundary> italicTextBoundaries = new ArrayList<>();
|
||||||
|
String orientation;
|
||||||
|
int textDirection;
|
||||||
|
|
||||||
//position coordinates
|
//position coordinates
|
||||||
List<Integer> stringIdxToPositionIdx;
|
@Builder.Default
|
||||||
List<Rectangle2D> positions;
|
List<Integer> stringIdxToPositionIdx = new ArrayList<>();
|
||||||
|
@Builder.Default
|
||||||
|
List<Rectangle2D> positions = new ArrayList<>();
|
||||||
|
|
||||||
@EqualsAndHashCode.Exclude
|
@EqualsAndHashCode.Exclude
|
||||||
SemanticNode parent;
|
SemanticNode parent;
|
||||||
@ -55,23 +64,34 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static AtomicTextBlock fromSearchTextWithTextPositionDto(SearchTextWithTextPositionDto searchTextWithTextPositionDto,
|
public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText,
|
||||||
SemanticNode parent,
|
List<Integer> lineBreaks,
|
||||||
int stringOffset,
|
List<Boundary> boldTextBoundaries,
|
||||||
Long textBlockIdx,
|
List<Boundary> italicTextBoundaries,
|
||||||
Integer numberOnPage,
|
List<Rectangle2D> positions,
|
||||||
Page page) {
|
List<Integer> stringIdxToPositionIdx,
|
||||||
|
long idx,
|
||||||
|
SemanticNode parent,
|
||||||
|
int numberOnPage,
|
||||||
|
Page page,
|
||||||
|
int offset,
|
||||||
|
String orientation,
|
||||||
|
int textDirection) {
|
||||||
|
|
||||||
return AtomicTextBlock.builder()
|
return AtomicTextBlock.builder()
|
||||||
.id(textBlockIdx)
|
.id(idx)
|
||||||
.parent(parent)
|
.parent(parent)
|
||||||
.searchText(searchTextWithTextPositionDto.getSearchText())
|
.searchText(searchText)
|
||||||
.numberOnPage(numberOnPage)
|
.numberOnPage(numberOnPage)
|
||||||
.page(page)
|
.page(page)
|
||||||
.lineBreaks(searchTextWithTextPositionDto.getLineBreaks())
|
.lineBreaks(lineBreaks)
|
||||||
.positions(searchTextWithTextPositionDto.getPositions())
|
.boldTextBoundaries(boldTextBoundaries)
|
||||||
.stringIdxToPositionIdx(searchTextWithTextPositionDto.getStringCoordsToPositionCoords())
|
.italicTextBoundaries(italicTextBoundaries)
|
||||||
.boundary(new Boundary(stringOffset, stringOffset + searchTextWithTextPositionDto.getSearchText().length()))
|
.positions(positions)
|
||||||
|
.stringIdxToPositionIdx(stringIdxToPositionIdx)
|
||||||
|
.boundary(new Boundary(offset, offset + searchText.length()))
|
||||||
|
.textDirection(textDirection)
|
||||||
|
.orientation(orientation)
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -82,11 +102,8 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
.id(textBlockIdx)
|
.id(textBlockIdx)
|
||||||
.boundary(new Boundary(stringOffset, stringOffset))
|
.boundary(new Boundary(stringOffset, stringOffset))
|
||||||
.searchText("")
|
.searchText("")
|
||||||
.lineBreaks(Collections.emptyList())
|
|
||||||
.page(page)
|
.page(page)
|
||||||
.numberOnPage(numberOnPage)
|
.numberOnPage(numberOnPage)
|
||||||
.stringIdxToPositionIdx(Collections.emptyList())
|
|
||||||
.positions(Collections.emptyList())
|
|
||||||
.parent(parent)
|
.parent(parent)
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.graph.textblock;
|
|||||||
import static java.lang.String.format;
|
import static java.lang.String.format;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
@ -182,4 +183,38 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
return getSearchText();
|
return getSearchText();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Boundary> getBoldTextBoundaries() {
|
||||||
|
|
||||||
|
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Boundary> getItalicTextBoundaries() {
|
||||||
|
|
||||||
|
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getOrientation() {
|
||||||
|
|
||||||
|
if (atomicTextBlocks.isEmpty()) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
return atomicTextBlocks.get(0).getOrientation();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getTextDirection() {
|
||||||
|
|
||||||
|
if (atomicTextBlocks.isEmpty()) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return atomicTextBlocks.get(0).getTextDirection();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -21,6 +21,18 @@ public interface TextBlock extends CharSequence {
|
|||||||
List<AtomicTextBlock> getAtomicTextBlocks();
|
List<AtomicTextBlock> getAtomicTextBlocks();
|
||||||
|
|
||||||
|
|
||||||
|
List<Boundary> getBoldTextBoundaries();
|
||||||
|
|
||||||
|
|
||||||
|
List<Boundary> getItalicTextBoundaries();
|
||||||
|
|
||||||
|
|
||||||
|
String getOrientation();
|
||||||
|
|
||||||
|
|
||||||
|
int getTextDirection();
|
||||||
|
|
||||||
|
|
||||||
Boundary getBoundary();
|
Boundary getBoundary();
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,15 +1,15 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.mapper;
|
package com.knecon.fforesight.service.layoutparser.processor.mapper.redaction;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.PageData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTreeData;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image;
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.mapper;
|
package com.knecon.fforesight.service.layoutparser.processor.mapper.redaction;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
@ -7,11 +7,11 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.NoSuchElementException;
|
import java.util.NoSuchElementException;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTreeData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.PageData;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Footer;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Footer;
|
||||||
@ -1,4 +1,4 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.mapper;
|
package com.knecon.fforesight.service.layoutparser.processor.mapper.redaction;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
@ -0,0 +1,108 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.mapper.taas;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
import java.util.stream.IntStream;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ParagraphData;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Range;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||||
|
|
||||||
|
public class TaasDocumentDataMapper {
|
||||||
|
|
||||||
|
public static ResearchDocumentData fromDocument(Document document) {
|
||||||
|
AtomicInteger structureObjectNumber = new AtomicInteger();
|
||||||
|
List<StructureObject> structureObjects = document.streamAllSubNodes()
|
||||||
|
.filter(node -> !node.getType().equals(NodeType.TABLE_CELL))
|
||||||
|
.filter(node -> !node.getType().equals(NodeType.SECTION))
|
||||||
|
.map(node -> {
|
||||||
|
if (node.getType().equals(NodeType.TABLE)) {
|
||||||
|
return TaasDocumentDataMapper.fromTableWithTableData((Table) node, structureObjectNumber.getAndIncrement());
|
||||||
|
} else {
|
||||||
|
return TaasDocumentDataMapper.fromSemanticNodeWithParagraphData(node, structureObjectNumber.getAndIncrement());
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.toList();
|
||||||
|
return ResearchDocumentData.builder().structureObjects(structureObjects).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static ParagraphData fromTextBlock(String classification, TextBlock textBlock) {
|
||||||
|
|
||||||
|
return ParagraphData.builder()
|
||||||
|
.boldTextBoundaries(textBlock.getBoldTextBoundaries().stream().map(b -> new Range(b.start(), b.end())).toList())
|
||||||
|
.italicTextBoundaries(textBlock.getItalicTextBoundaries().stream().map(b -> new Range(b.start(), b.end())).toList())
|
||||||
|
.text(textBlock.getSearchText())
|
||||||
|
.classification(classification)
|
||||||
|
.orientation(textBlock.getOrientation())
|
||||||
|
.textDirection(textBlock.getTextDirection())
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static TableData fromTable(Table table) {
|
||||||
|
|
||||||
|
List<RowData> rowData = IntStream.range(0, table.getNumberOfRows()).boxed().map(rowIdx -> table.streamRow(rowIdx).toList()).map(TaasDocumentDataMapper::fromTableCells).toList();
|
||||||
|
return new TableData(rowData, table.getNumberOfCols(), table.getNumberOfRows());
|
||||||
|
}
|
||||||
|
|
||||||
|
public static RowData fromTableCells(List<TableCell> tableCells) {
|
||||||
|
|
||||||
|
if (tableCells.isEmpty()) {
|
||||||
|
throw new IllegalArgumentException("no table cells provided");
|
||||||
|
}
|
||||||
|
boolean header = tableCells.stream().allMatch(TableCell::isHeader);
|
||||||
|
Page firstPage = tableCells.get(0).getFirstPage();
|
||||||
|
Rectangle2D bBox = tableCells.stream().map(TableCell::getBBox).reduce((map1, map2) -> {
|
||||||
|
map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D()));
|
||||||
|
return map2;
|
||||||
|
}).orElseThrow().get(firstPage);
|
||||||
|
List<String> cellText = tableCells.stream().map(TableCell::getTextBlock).map(TextBlock::getSearchText).toList();
|
||||||
|
return new RowData(header, cellText, toFloatArray(bBox));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static StructureObject fromSemanticNodeWithParagraphData(SemanticNode semanticNode, Integer structureObjectNumber) {
|
||||||
|
|
||||||
|
Page page = semanticNode.getFirstPage();
|
||||||
|
Rectangle2D bBox = semanticNode.getBBox().get(page);
|
||||||
|
return StructureObject.builder()
|
||||||
|
.structureObjectNumber(structureObjectNumber)
|
||||||
|
.boundingBox(toFloatArray(bBox))
|
||||||
|
.stringOffset(semanticNode.getBoundary().start())
|
||||||
|
.page(page.getNumber())
|
||||||
|
.paragraph(TaasDocumentDataMapper.fromTextBlock(semanticNode.getType().toString().toLowerCase(Locale.ROOT), semanticNode.getTextBlock()))
|
||||||
|
.table(null)
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static StructureObject fromTableWithTableData(Table table, int structureObjectNumber) {
|
||||||
|
|
||||||
|
Page page = table.getFirstPage();
|
||||||
|
Rectangle2D bBox = table.getBBox().get(page);
|
||||||
|
return StructureObject.builder()
|
||||||
|
.structureObjectNumber(structureObjectNumber)
|
||||||
|
.boundingBox(toFloatArray(bBox))
|
||||||
|
.stringOffset(table.getBoundary().start())
|
||||||
|
.page(page.getNumber())
|
||||||
|
.paragraph(null)
|
||||||
|
.table(TaasDocumentDataMapper.fromTable(table))
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static float[] toFloatArray(Rectangle2D bBox) {
|
||||||
|
|
||||||
|
return new float[]{(float) bBox.getX(), (float) bBox.getY(), (float) bBox.getWidth(), (float) bBox.getHeight()};
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -13,7 +13,7 @@ import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
|||||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||||
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||||
|
|||||||
@ -0,0 +1,103 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.server;
|
||||||
|
|
||||||
|
import java.awt.Color;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.Loader;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.mapper.taas.TaasDocumentDataMapper;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.server.utils.BaseTest;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
|
public class BdrJsonBuildTest extends BaseTest {
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private ObjectMapper objectMapper;
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private LayoutParsingService layoutParsingService;
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
protected Document buildGraph(File filename) {
|
||||||
|
|
||||||
|
try (InputStream inputStream = new FileInputStream(filename)) {
|
||||||
|
PDDocument pdDocument = Loader.loadPDF(inputStream);
|
||||||
|
return layoutParsingService.parseLayoutWithTimer(pdDocument, new ImageServiceResponse(), new TableServiceResponse());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void writeBDRDocumentData() throws IOException {
|
||||||
|
|
||||||
|
String sourcePath = "/tmp/bdr_files";
|
||||||
|
String targetPath = "/tmp/result";
|
||||||
|
Paths.get(targetPath).toFile().mkdirs();
|
||||||
|
|
||||||
|
List<File> files = Files.walk(Paths.get(sourcePath)).filter(currentPath -> currentPath.toString().endsWith(".pdf")).map(Path::toFile).toList();
|
||||||
|
|
||||||
|
System.out.printf("Found %d files \n", files.size());
|
||||||
|
for (int i = 0; i < files.size(); i++) {
|
||||||
|
System.out.println(i + ": " + files.get(i));
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println();
|
||||||
|
|
||||||
|
for (var file : files) {
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
|
System.out.println("Starting Structure Analysis for: " + file);
|
||||||
|
Document document = buildGraph(file);
|
||||||
|
|
||||||
|
long start2 = System.currentTimeMillis();
|
||||||
|
ResearchDocumentData researchDocumentData = TaasDocumentDataMapper.fromDocument(document);
|
||||||
|
researchDocumentData.setOriginalFile(file.toString());
|
||||||
|
System.out.printf(", mapped to research data %d ms \n", System.currentTimeMillis() - start2);
|
||||||
|
|
||||||
|
File jsonFile = Paths.get(targetPath, file.getName().replace(".pdf", ".json")).toFile();
|
||||||
|
try (FileOutputStream fileOutputStream = new FileOutputStream(jsonFile)) {
|
||||||
|
System.out.println("json written to: " + jsonFile);
|
||||||
|
fileOutputStream.write(objectMapper.writeValueAsBytes(researchDocumentData));
|
||||||
|
}
|
||||||
|
File visualizationFile = Paths.get(targetPath, file.getName().replace(".pdf", "_BBOX.pdf")).toFile();
|
||||||
|
visualizeSemanticNodes(file, visualizationFile, document, document.getTextBlock());
|
||||||
|
System.out.println("visualization pdf written to: " + visualizationFile);
|
||||||
|
System.out.printf("Full analysis and file creation took %s\n\n", Duration.ofMillis(System.currentTimeMillis() - start));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void visualizeSemanticNodes(File file, File resultingFileName, Document document, TextBlock textBlock) throws IOException {
|
||||||
|
|
||||||
|
try (var fileStream = new FileInputStream(file); var outputStream = new FileOutputStream(resultingFileName)) {
|
||||||
|
PDDocument pdDocument = Loader.loadPDF(fileStream);
|
||||||
|
PdfDraw.drawDocumentGraph(pdDocument, document);
|
||||||
|
PdfDraw.drawTextBlock(pdDocument, textBlock, PdfDraw.Options.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build());
|
||||||
|
pdDocument.save(outputStream);
|
||||||
|
pdDocument.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -8,9 +8,9 @@ import org.junit.jupiter.api.Test;
|
|||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentDataMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
|
|||||||
@ -2,11 +2,11 @@ package com.knecon.fforesight.service.layoutparser.server.graph;
|
|||||||
|
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentDataMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentGraphMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentGraphMapper;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
|
|||||||
@ -14,7 +14,7 @@ import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
|||||||
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||||
import org.apache.pdfbox.util.Matrix;
|
import org.apache.pdfbox.util.Matrix;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user