Compare commits
2 Commits
main
...
0.194.0-RE
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8f3f8845a1 | ||
|
|
a9803cf685 |
@ -17,4 +17,6 @@ public class LayoutParserSettings {
|
|||||||
|
|
||||||
boolean debug;
|
boolean debug;
|
||||||
LayoutParsingType layoutParsingTypeOverride;
|
LayoutParsingType layoutParsingTypeOverride;
|
||||||
|
String pdftronLicense;
|
||||||
|
String layoutParserVersion;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -20,7 +20,6 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
|||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||||
import org.springframework.beans.factory.annotation.Value;
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper;
|
import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper;
|
||||||
@ -88,32 +87,29 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@Slf4j
|
@Slf4j
|
||||||
@Service
|
@Service
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class LayoutParsingPipeline {
|
public class LayoutParsingPipeline {
|
||||||
|
|
||||||
final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||||
final CvTableParsingAdapter cvTableParsingAdapter;
|
CvTableParsingAdapter cvTableParsingAdapter;
|
||||||
final LayoutParsingStorageService layoutParsingStorageService;
|
LayoutParsingStorageService layoutParsingStorageService;
|
||||||
final SectionsBuilderService sectionsBuilderService;
|
SectionsBuilderService sectionsBuilderService;
|
||||||
final SimplifiedSectionTextService simplifiedSectionTextService;
|
SimplifiedSectionTextService simplifiedSectionTextService;
|
||||||
final RulingCleaningService rulingCleaningService;
|
RulingCleaningService rulingCleaningService;
|
||||||
final TableExtractionService tableExtractionService;
|
TableExtractionService tableExtractionService;
|
||||||
final DocuMineBlockificationService docuMineBlockificationService;
|
DocuMineBlockificationService docuMineBlockificationService;
|
||||||
final RedactManagerBlockificationService redactManagerBlockificationService;
|
RedactManagerBlockificationService redactManagerBlockificationService;
|
||||||
final BlockificationPostprocessingService blockificationPostprocessingService;
|
BlockificationPostprocessingService blockificationPostprocessingService;
|
||||||
final DocstrumBlockificationService docstrumBlockificationService;
|
DocstrumBlockificationService docstrumBlockificationService;
|
||||||
final LayoutGridService layoutGridService;
|
LayoutGridService layoutGridService;
|
||||||
final ObservationRegistry observationRegistry;
|
ObservationRegistry observationRegistry;
|
||||||
final VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||||
final GraphicExtractorService graphicExtractorService;
|
GraphicExtractorService graphicExtractorService;
|
||||||
final OutlineExtractorService outlineExtractorService;
|
OutlineExtractorService outlineExtractorService;
|
||||||
final SectionTreeBuilderService sectionTreeBuilderService;
|
SectionTreeBuilderService sectionTreeBuilderService;
|
||||||
final SectionTreeEnhancementService sectionTreeEnhancementService;
|
SectionTreeEnhancementService sectionTreeEnhancementService;
|
||||||
final LayoutParserSettings settings;
|
LayoutParserSettings settings;
|
||||||
final ClassificationService classificationService;
|
ClassificationService classificationService;
|
||||||
|
|
||||||
@Value("${LAYOUT_PARSER_VERSION:}")
|
|
||||||
private String layoutParserVersion;
|
|
||||||
|
|
||||||
|
|
||||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||||
@ -137,6 +133,7 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null //
|
LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null //
|
||||||
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();
|
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();
|
||||||
|
String layoutParserVersion = settings.getLayoutParserVersion();
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingType,
|
ClassificationDocument classificationDocument = parseLayout(layoutParsingType,
|
||||||
originFile,
|
originFile,
|
||||||
|
|||||||
@ -24,7 +24,7 @@ import lombok.EqualsAndHashCode;
|
|||||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
|
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
|
||||||
public class Line extends TextBoundingBox {
|
public class Line extends TextBoundingBox {
|
||||||
|
|
||||||
private static final double WORD_DISTANCE_MULTIPLIER = 0.17;
|
private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
|
||||||
|
|
||||||
@EqualsAndHashCode.Include
|
@EqualsAndHashCode.Include
|
||||||
private final double x0;
|
private final double x0;
|
||||||
@ -157,9 +157,6 @@ public class Line extends TextBoundingBox {
|
|||||||
|
|
||||||
private void computeWords(List<Character> characters, double wordSpacing) {
|
private void computeWords(List<Character> characters, double wordSpacing) {
|
||||||
|
|
||||||
// Imo, the width of space should be scaled with the font size, but it only depends on the median distance between horizontal neighbours.
|
|
||||||
// If there are large differences in fontsize on a page, this might lead to missing spaces for the smaller fonts and too many for larger fonts.
|
|
||||||
// I've just now changed the scaling factor. If you come across this comment with missing whitespaces again, try scaling the fontsize instead of simply changing the factor again.
|
|
||||||
Word word = new Word();
|
Word word = new Word();
|
||||||
Character previous = null;
|
Character previous = null;
|
||||||
for (Character current : characters) {
|
for (Character current : characters) {
|
||||||
|
|||||||
@ -18,14 +18,10 @@ public class ClassificationPatterns {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
public static final Pattern TABLE_OR_FIGURE_HEADLINE_PATTERN = Pattern.compile(
|
public static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile(
|
||||||
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
|
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
|
||||||
Pattern.CASE_INSENSITIVE);
|
Pattern.CASE_INSENSITIVE);
|
||||||
|
|
||||||
public static final Pattern TABLE_MID_SENTENCE_PATTERN = Pattern.compile(
|
|
||||||
"(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
|
|
||||||
Pattern.CASE_INSENSITIVE);
|
|
||||||
|
|
||||||
public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
|
public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
|
||||||
|
|
||||||
public static final Pattern NUMERIC = Pattern.compile("[0-9]+");
|
public static final Pattern NUMERIC = Pattern.compile("[0-9]+");
|
||||||
|
|||||||
@ -6,8 +6,7 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.clas
|
|||||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES;
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES;
|
||||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN;
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN;
|
||||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN;
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN;
|
||||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_MID_SENTENCE_PATTERN;
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_PATTERN;
|
||||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_HEADLINE_PATTERN;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
@ -84,8 +83,7 @@ public class DocuMineClassificationService {
|
|||||||
Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString());
|
Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString());
|
||||||
Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString());
|
Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString());
|
||||||
Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString());
|
Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString());
|
||||||
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_HEADLINE_PATTERN.matcher(textBlock.toString());
|
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTERN.matcher(textBlock.toString());
|
||||||
Matcher tableMidSentenceMatcher = TABLE_MID_SENTENCE_PATTERN.matcher(textBlock.toString());
|
|
||||||
Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString());
|
Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString());
|
||||||
boolean isAtLeast3Characters = atLeast3Matcher.reset().find();
|
boolean isAtLeast3Characters = atLeast3Matcher.reset().find();
|
||||||
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
|
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
|
||||||
@ -150,8 +148,6 @@ public class DocuMineClassificationService {
|
|||||||
&& greaterOrEqualFontThanPageAverage(textBlock, page)//
|
&& greaterOrEqualFontThanPageAverage(textBlock, page)//
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9//
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9//
|
||||||
&& (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) //
|
&& (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) //
|
||||||
&& tableMidSentenceMatcher.reset().results()
|
|
||||||
.count() <= 1 //
|
|
||||||
&& !isAmount//
|
&& !isAmount//
|
||||||
&& !headlineWithSlashesMatches) {
|
&& !headlineWithSlashesMatches) {
|
||||||
|
|
||||||
|
|||||||
@ -71,9 +71,6 @@ public class TableOfContentsClassificationService {
|
|||||||
|
|
||||||
private int identifyTOCItems(int start, List<TextBlockOnPage> textBlocks, ClassificationDocument document) {
|
private int identifyTOCItems(int start, List<TextBlockOnPage> textBlocks, ClassificationDocument document) {
|
||||||
|
|
||||||
if (start >= textBlocks.size()) {
|
|
||||||
return start;
|
|
||||||
}
|
|
||||||
ClassificationPage startPage = textBlocks.get(start).page();
|
ClassificationPage startPage = textBlocks.get(start).page();
|
||||||
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
|
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
|
||||||
HashMap<NumberWord, TextBlockOnPage> numberToBlockLookup = new HashMap<>();
|
HashMap<NumberWord, TextBlockOnPage> numberToBlockLookup = new HashMap<>();
|
||||||
|
|||||||
@ -417,15 +417,14 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
|||||||
public void addVersionAndLayoutParsingType(String version, String layoutParsingType, Page page) {
|
public void addVersionAndLayoutParsingType(String version, String layoutParsingType, Page page) {
|
||||||
|
|
||||||
PageInformation pageInformation = PageInformation.fromPage(page);
|
PageInformation pageInformation = PageInformation.fromPage(page);
|
||||||
double startHeight = pageInformation.heightRot() - 5;
|
Point2D point1 = new Point2D.Double(0, pageInformation.height() - 5);
|
||||||
Point2D point1 = new Point2D.Double(0, startHeight);
|
Point2D point2 = new Point2D.Double(0, pageInformation.height() - 5 - FONT_SIZE * 1.5);
|
||||||
Point2D point2 = new Point2D.Double(0, startHeight - FONT_SIZE * 1.5);
|
|
||||||
AffineTransform affineTransform = CoordinateTransforms.calculatePageCoordsToInitialUserSpaceCoords(pageInformation);
|
AffineTransform affineTransform = CoordinateTransforms.calculatePageCoordsToInitialUserSpaceCoords(pageInformation);
|
||||||
affineTransform.transform(point1, point1);
|
Point2D transformed1 = affineTransform.transform(point1, point1);
|
||||||
affineTransform.transform(point2, point2);
|
Point2D transformed2 = affineTransform.transform(point2, point2);
|
||||||
getOrCreateVisualizationsOnPage(page.getNumber(), this.versionAndType).getPlacedTexts()
|
getOrCreateVisualizationsOnPage(page.getNumber(), this.versionAndType).getPlacedTexts()
|
||||||
.addAll(List.of(PlacedText.textFacingUp(String.valueOf(version), point1, FONT_SIZE, Color.BLACK, FONT),
|
.addAll(List.of(PlacedText.textFacingUp(version, transformed1, FONT_SIZE, Color.BLACK, FONT),
|
||||||
PlacedText.textFacingUp(String.valueOf(layoutParsingType), point2, FONT_SIZE, Color.BLACK, FONT)));
|
PlacedText.textFacingUp(layoutParsingType, transformed2, FONT_SIZE, Color.BLACK, FONT)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -39,7 +39,6 @@ dependencies {
|
|||||||
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
|
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
|
||||||
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
|
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
|
||||||
implementation("ch.qos.logback:logback-classic")
|
implementation("ch.qos.logback:logback-classic")
|
||||||
api("com.iqser.red.commons:metric-commons:2.3.0")
|
|
||||||
|
|
||||||
implementation("com.pdftron:PDFNet:10.11.0")
|
implementation("com.pdftron:PDFNet:10.11.0")
|
||||||
|
|
||||||
|
|||||||
@ -44,7 +44,7 @@ class ViewerDocVersioningUtilityTest {
|
|||||||
doc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
doc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||||
}
|
}
|
||||||
assert ViewerDocVersioningUtility.isCurrentVersion(tmpFile.toFile());
|
assert ViewerDocVersioningUtility.isCurrentVersion(tmpFile.toFile());
|
||||||
assert tmpFile.toFile().delete();
|
//assert tmpFile.toFile().delete();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user