Compare commits

..

2 Commits

9 changed files with 35 additions and 52 deletions

View File

@ -17,4 +17,6 @@ public class LayoutParserSettings {
boolean debug; boolean debug;
LayoutParsingType layoutParsingTypeOverride; LayoutParsingType layoutParsingTypeOverride;
String pdftronLicense;
String layoutParserVersion;
} }

View File

@ -20,7 +20,6 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper; import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper;
@ -88,32 +87,29 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
@Service @Service
@RequiredArgsConstructor @RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE) @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class LayoutParsingPipeline { public class LayoutParsingPipeline {
final ImageServiceResponseAdapter imageServiceResponseAdapter; ImageServiceResponseAdapter imageServiceResponseAdapter;
final CvTableParsingAdapter cvTableParsingAdapter; CvTableParsingAdapter cvTableParsingAdapter;
final LayoutParsingStorageService layoutParsingStorageService; LayoutParsingStorageService layoutParsingStorageService;
final SectionsBuilderService sectionsBuilderService; SectionsBuilderService sectionsBuilderService;
final SimplifiedSectionTextService simplifiedSectionTextService; SimplifiedSectionTextService simplifiedSectionTextService;
final RulingCleaningService rulingCleaningService; RulingCleaningService rulingCleaningService;
final TableExtractionService tableExtractionService; TableExtractionService tableExtractionService;
final DocuMineBlockificationService docuMineBlockificationService; DocuMineBlockificationService docuMineBlockificationService;
final RedactManagerBlockificationService redactManagerBlockificationService; RedactManagerBlockificationService redactManagerBlockificationService;
final BlockificationPostprocessingService blockificationPostprocessingService; BlockificationPostprocessingService blockificationPostprocessingService;
final DocstrumBlockificationService docstrumBlockificationService; DocstrumBlockificationService docstrumBlockificationService;
final LayoutGridService layoutGridService; LayoutGridService layoutGridService;
final ObservationRegistry observationRegistry; ObservationRegistry observationRegistry;
final VisualLayoutParsingAdapter visualLayoutParsingAdapter; VisualLayoutParsingAdapter visualLayoutParsingAdapter;
final GraphicExtractorService graphicExtractorService; GraphicExtractorService graphicExtractorService;
final OutlineExtractorService outlineExtractorService; OutlineExtractorService outlineExtractorService;
final SectionTreeBuilderService sectionTreeBuilderService; SectionTreeBuilderService sectionTreeBuilderService;
final SectionTreeEnhancementService sectionTreeEnhancementService; SectionTreeEnhancementService sectionTreeEnhancementService;
final LayoutParserSettings settings; LayoutParserSettings settings;
final ClassificationService classificationService; ClassificationService classificationService;
@Value("${LAYOUT_PARSER_VERSION:}")
private String layoutParserVersion;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -137,6 +133,7 @@ public class LayoutParsingPipeline {
LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null // LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(); ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();
String layoutParserVersion = settings.getLayoutParserVersion();
ClassificationDocument classificationDocument = parseLayout(layoutParsingType, ClassificationDocument classificationDocument = parseLayout(layoutParsingType,
originFile, originFile,

View File

@ -24,7 +24,7 @@ import lombok.EqualsAndHashCode;
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false) @EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
public class Line extends TextBoundingBox { public class Line extends TextBoundingBox {
private static final double WORD_DISTANCE_MULTIPLIER = 0.17; private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
@EqualsAndHashCode.Include @EqualsAndHashCode.Include
private final double x0; private final double x0;
@ -157,9 +157,6 @@ public class Line extends TextBoundingBox {
private void computeWords(List<Character> characters, double wordSpacing) { private void computeWords(List<Character> characters, double wordSpacing) {
// Imo, the width of space should be scaled with the font size, but it only depends on the median distance between horizontal neighbours.
// If there are large differences in fontsize on a page, this might lead to missing spaces for the smaller fonts and too many for larger fonts.
// I've just now changed the scaling factor. If you come across this comment with missing whitespaces again, try scaling the fontsize instead of simply changing the factor again.
Word word = new Word(); Word word = new Word();
Character previous = null; Character previous = null;
for (Character current : characters) { for (Character current : characters) {

View File

@ -18,14 +18,10 @@ public class ClassificationPatterns {
public static final Pattern TABLE_OR_FIGURE_HEADLINE_PATTERN = Pattern.compile( public static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile(
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b", "^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
Pattern.CASE_INSENSITIVE); Pattern.CASE_INSENSITIVE);
public static final Pattern TABLE_MID_SENTENCE_PATTERN = Pattern.compile(
"(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
Pattern.CASE_INSENSITIVE);
public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]"); public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
public static final Pattern NUMERIC = Pattern.compile("[0-9]+"); public static final Pattern NUMERIC = Pattern.compile("[0-9]+");

View File

@ -6,8 +6,7 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.clas
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES; import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN; import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN; import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_MID_SENTENCE_PATTERN; import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_PATTERN;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_HEADLINE_PATTERN;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
@ -84,8 +83,7 @@ public class DocuMineClassificationService {
Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString()); Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString());
Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString()); Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString());
Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString()); Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString());
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_HEADLINE_PATTERN.matcher(textBlock.toString()); Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTERN.matcher(textBlock.toString());
Matcher tableMidSentenceMatcher = TABLE_MID_SENTENCE_PATTERN.matcher(textBlock.toString());
Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString()); Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString());
boolean isAtLeast3Characters = atLeast3Matcher.reset().find(); boolean isAtLeast3Characters = atLeast3Matcher.reset().find();
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches(); boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
@ -150,8 +148,6 @@ public class DocuMineClassificationService {
&& greaterOrEqualFontThanPageAverage(textBlock, page)// && greaterOrEqualFontThanPageAverage(textBlock, page)//
&& PositionUtils.getApproxLineCount(textBlock) < 2.9// && PositionUtils.getApproxLineCount(textBlock) < 2.9//
&& (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) // && (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) //
&& tableMidSentenceMatcher.reset().results()
.count() <= 1 //
&& !isAmount// && !isAmount//
&& !headlineWithSlashesMatches) { && !headlineWithSlashesMatches) {

View File

@ -71,9 +71,6 @@ public class TableOfContentsClassificationService {
private int identifyTOCItems(int start, List<TextBlockOnPage> textBlocks, ClassificationDocument document) { private int identifyTOCItems(int start, List<TextBlockOnPage> textBlocks, ClassificationDocument document) {
if (start >= textBlocks.size()) {
return start;
}
ClassificationPage startPage = textBlocks.get(start).page(); ClassificationPage startPage = textBlocks.get(start).page();
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size())); List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
HashMap<NumberWord, TextBlockOnPage> numberToBlockLookup = new HashMap<>(); HashMap<NumberWord, TextBlockOnPage> numberToBlockLookup = new HashMap<>();

View File

@ -417,15 +417,14 @@ public class LayoutGrid extends LayoutGridLayerConfig {
public void addVersionAndLayoutParsingType(String version, String layoutParsingType, Page page) { public void addVersionAndLayoutParsingType(String version, String layoutParsingType, Page page) {
PageInformation pageInformation = PageInformation.fromPage(page); PageInformation pageInformation = PageInformation.fromPage(page);
double startHeight = pageInformation.heightRot() - 5; Point2D point1 = new Point2D.Double(0, pageInformation.height() - 5);
Point2D point1 = new Point2D.Double(0, startHeight); Point2D point2 = new Point2D.Double(0, pageInformation.height() - 5 - FONT_SIZE * 1.5);
Point2D point2 = new Point2D.Double(0, startHeight - FONT_SIZE * 1.5);
AffineTransform affineTransform = CoordinateTransforms.calculatePageCoordsToInitialUserSpaceCoords(pageInformation); AffineTransform affineTransform = CoordinateTransforms.calculatePageCoordsToInitialUserSpaceCoords(pageInformation);
affineTransform.transform(point1, point1); Point2D transformed1 = affineTransform.transform(point1, point1);
affineTransform.transform(point2, point2); Point2D transformed2 = affineTransform.transform(point2, point2);
getOrCreateVisualizationsOnPage(page.getNumber(), this.versionAndType).getPlacedTexts() getOrCreateVisualizationsOnPage(page.getNumber(), this.versionAndType).getPlacedTexts()
.addAll(List.of(PlacedText.textFacingUp(String.valueOf(version), point1, FONT_SIZE, Color.BLACK, FONT), .addAll(List.of(PlacedText.textFacingUp(version, transformed1, FONT_SIZE, Color.BLACK, FONT),
PlacedText.textFacingUp(String.valueOf(layoutParsingType), point2, FONT_SIZE, Color.BLACK, FONT))); PlacedText.textFacingUp(layoutParsingType, transformed2, FONT_SIZE, Color.BLACK, FONT)));
} }

View File

@ -39,7 +39,6 @@ dependencies {
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4") implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
implementation("net.logstash.logback:logstash-logback-encoder:7.4") implementation("net.logstash.logback:logstash-logback-encoder:7.4")
implementation("ch.qos.logback:logback-classic") implementation("ch.qos.logback:logback-classic")
api("com.iqser.red.commons:metric-commons:2.3.0")
implementation("com.pdftron:PDFNet:10.11.0") implementation("com.pdftron:PDFNet:10.11.0")

View File

@ -44,7 +44,7 @@ class ViewerDocVersioningUtilityTest {
doc.save(out, SDFDoc.SaveMode.LINEARIZED, null); doc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
} }
assert ViewerDocVersioningUtility.isCurrentVersion(tmpFile.toFile()); assert ViewerDocVersioningUtility.isCurrentVersion(tmpFile.toFile());
assert tmpFile.toFile().delete(); //assert tmpFile.toFile().delete();
} }
} }