Compare commits

..

10 Commits

Author SHA1 Message Date
Dominique Eifländer
ef23ee0ade Merge branch 'RED-10752-main' into 'main'
RED-10752: Enabled prometheus

See merge request fforesight/layout-parser!267
2025-01-29 13:34:01 +01:00
Dominique Eifländer
af31f52b47 RED-10752: Enabled prometheus 2025-01-29 11:09:29 +01:00
Kilian Schüttler
b5152112ee Merge branch 'RM-231' into 'main'
RM-231: missing whitespace in name

See merge request fforesight/layout-parser!264
2025-01-14 13:04:10 +01:00
Kilian Schuettler
85ea4ef455 RM-231: missing whitespace in name 2025-01-14 12:59:01 +01:00
Kilian Schüttler
01f8c01fff Merge branch 'RED-10714' into 'main'
RED-10714: fix IndexOutOfBoundsException

See merge request fforesight/layout-parser!262
2025-01-10 12:33:18 +01:00
Kilian Schuettler
0b6a292c75 RED-10714: fix IndexOutOfBoundsException 2025-01-10 12:12:14 +01:00
Maverick Studer
e24020589c Merge branch 'feature/RED-9998' into 'main'
RED-9998: App version history (for conditional re-analyzing the layout of a file)

See merge request fforesight/layout-parser!259
2024-12-12 09:58:46 +01:00
Maverick Studer
c619b845e8 RED-9998: App version history (for conditional re-analyzing the layout of a file) 2024-12-12 09:58:46 +01:00
Kilian Schüttler
ed0371ca11 Merge branch 'RED-10127' into 'main'
RED-10127: Paragraphs with multiple table, appendix, figure can't be headlines

See merge request fforesight/layout-parser!257
2024-12-06 14:49:48 +01:00
Kilian Schuettler
89b5be8d67 RED-10127: Paragraphs with multiple table, appendix, figure can't be headlines 2024-12-06 13:41:44 +01:00
5 changed files with 19 additions and 4 deletions

View File

@ -24,7 +24,7 @@ import lombok.EqualsAndHashCode;
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false) @EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
public class Line extends TextBoundingBox { public class Line extends TextBoundingBox {
private static final double WORD_DISTANCE_MULTIPLIER = 0.18; private static final double WORD_DISTANCE_MULTIPLIER = 0.17;
@EqualsAndHashCode.Include @EqualsAndHashCode.Include
private final double x0; private final double x0;
@ -157,6 +157,9 @@ public class Line extends TextBoundingBox {
private void computeWords(List<Character> characters, double wordSpacing) { private void computeWords(List<Character> characters, double wordSpacing) {
// IMO, the width of a space should be scaled with the font size, but it currently depends only on the median distance between horizontal neighbours.
// If there are large differences in font size on a page, this might lead to missing spaces for the smaller fonts and too many spaces for the larger fonts.
// For now, only the scaling factor has been changed. If you come across this comment because whitespaces are missing again, try scaling with the font size instead of simply changing the factor again.
Word word = new Word(); Word word = new Word();
Character previous = null; Character previous = null;
for (Character current : characters) { for (Character current : characters) {

View File

@ -18,10 +18,14 @@ public class ClassificationPatterns {
public static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile( public static final Pattern TABLE_OR_FIGURE_HEADLINE_PATTERN = Pattern.compile(
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b", "^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
Pattern.CASE_INSENSITIVE); Pattern.CASE_INSENSITIVE);
public static final Pattern TABLE_MID_SENTENCE_PATTERN = Pattern.compile(
"(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
Pattern.CASE_INSENSITIVE);
public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]"); public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
public static final Pattern NUMERIC = Pattern.compile("[0-9]+"); public static final Pattern NUMERIC = Pattern.compile("[0-9]+");

View File

@ -6,7 +6,8 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.clas
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES; import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN; import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN; import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_PATTERN; import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_MID_SENTENCE_PATTERN;
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_HEADLINE_PATTERN;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
@ -83,7 +84,8 @@ public class DocuMineClassificationService {
Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString()); Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString());
Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString()); Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString());
Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString()); Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString());
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTERN.matcher(textBlock.toString()); Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_HEADLINE_PATTERN.matcher(textBlock.toString());
Matcher tableMidSentenceMatcher = TABLE_MID_SENTENCE_PATTERN.matcher(textBlock.toString());
Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString()); Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString());
boolean isAtLeast3Characters = atLeast3Matcher.reset().find(); boolean isAtLeast3Characters = atLeast3Matcher.reset().find();
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches(); boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
@ -148,6 +150,8 @@ public class DocuMineClassificationService {
&& greaterOrEqualFontThanPageAverage(textBlock, page)// && greaterOrEqualFontThanPageAverage(textBlock, page)//
&& PositionUtils.getApproxLineCount(textBlock) < 2.9// && PositionUtils.getApproxLineCount(textBlock) < 2.9//
&& (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) // && (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) //
&& tableMidSentenceMatcher.reset().results()
.count() <= 1 //
&& !isAmount// && !isAmount//
&& !headlineWithSlashesMatches) { && !headlineWithSlashesMatches) {

View File

@ -71,6 +71,9 @@ public class TableOfContentsClassificationService {
private int identifyTOCItems(int start, List<TextBlockOnPage> textBlocks, ClassificationDocument document) { private int identifyTOCItems(int start, List<TextBlockOnPage> textBlocks, ClassificationDocument document) {
if (start >= textBlocks.size()) {
return start;
}
ClassificationPage startPage = textBlocks.get(start).page(); ClassificationPage startPage = textBlocks.get(start).page();
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size())); List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
HashMap<NumberWord, TextBlockOnPage> numberToBlockLookup = new HashMap<>(); HashMap<NumberWord, TextBlockOnPage> numberToBlockLookup = new HashMap<>();

View File

@ -39,6 +39,7 @@ dependencies {
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4") implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
implementation("net.logstash.logback:logstash-logback-encoder:7.4") implementation("net.logstash.logback:logstash-logback-encoder:7.4")
implementation("ch.qos.logback:logback-classic") implementation("ch.qos.logback:logback-classic")
api("com.iqser.red.commons:metric-commons:2.3.0")
implementation("com.pdftron:PDFNet:10.11.0") implementation("com.pdftron:PDFNet:10.11.0")