Compare commits
10 Commits
0.194.0-RE
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ef23ee0ade | ||
|
|
af31f52b47 | ||
|
|
b5152112ee | ||
|
|
85ea4ef455 | ||
|
|
01f8c01fff | ||
|
|
0b6a292c75 | ||
|
|
e24020589c | ||
|
|
c619b845e8 | ||
|
|
ed0371ca11 | ||
|
|
89b5be8d67 |
@ -24,7 +24,7 @@ import lombok.EqualsAndHashCode;
|
|||||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
|
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
|
||||||
public class Line extends TextBoundingBox {
|
public class Line extends TextBoundingBox {
|
||||||
|
|
||||||
private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
|
private static final double WORD_DISTANCE_MULTIPLIER = 0.17;
|
||||||
|
|
||||||
@EqualsAndHashCode.Include
|
@EqualsAndHashCode.Include
|
||||||
private final double x0;
|
private final double x0;
|
||||||
@ -157,6 +157,9 @@ public class Line extends TextBoundingBox {
|
|||||||
|
|
||||||
private void computeWords(List<Character> characters, double wordSpacing) {
|
private void computeWords(List<Character> characters, double wordSpacing) {
|
||||||
|
|
||||||
|
// Imo, the width of space should be scaled with the font size, but it only depends on the median distance between horizontal neighbours.
|
||||||
|
// If there are large differences in fontsize on a page, this might lead to missing spaces for the smaller fonts and too many for larger fonts.
|
||||||
|
// I've just now changed the scaling factor. If you come across this comment with missing whitespaces again, try scaling the fontsize instead of simply changing the factor again.
|
||||||
Word word = new Word();
|
Word word = new Word();
|
||||||
Character previous = null;
|
Character previous = null;
|
||||||
for (Character current : characters) {
|
for (Character current : characters) {
|
||||||
|
|||||||
@ -18,10 +18,14 @@ public class ClassificationPatterns {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
public static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile(
|
public static final Pattern TABLE_OR_FIGURE_HEADLINE_PATTERN = Pattern.compile(
|
||||||
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
|
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
|
||||||
Pattern.CASE_INSENSITIVE);
|
Pattern.CASE_INSENSITIVE);
|
||||||
|
|
||||||
|
public static final Pattern TABLE_MID_SENTENCE_PATTERN = Pattern.compile(
|
||||||
|
"(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
|
||||||
|
Pattern.CASE_INSENSITIVE);
|
||||||
|
|
||||||
public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
|
public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
|
||||||
|
|
||||||
public static final Pattern NUMERIC = Pattern.compile("[0-9]+");
|
public static final Pattern NUMERIC = Pattern.compile("[0-9]+");
|
||||||
|
|||||||
@ -6,7 +6,8 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.clas
|
|||||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES;
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES;
|
||||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN;
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN;
|
||||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN;
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN;
|
||||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_PATTERN;
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_MID_SENTENCE_PATTERN;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_HEADLINE_PATTERN;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
@ -83,7 +84,8 @@ public class DocuMineClassificationService {
|
|||||||
Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString());
|
Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString());
|
||||||
Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString());
|
Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString());
|
||||||
Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString());
|
Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString());
|
||||||
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTERN.matcher(textBlock.toString());
|
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_HEADLINE_PATTERN.matcher(textBlock.toString());
|
||||||
|
Matcher tableMidSentenceMatcher = TABLE_MID_SENTENCE_PATTERN.matcher(textBlock.toString());
|
||||||
Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString());
|
Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString());
|
||||||
boolean isAtLeast3Characters = atLeast3Matcher.reset().find();
|
boolean isAtLeast3Characters = atLeast3Matcher.reset().find();
|
||||||
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
|
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
|
||||||
@ -148,6 +150,8 @@ public class DocuMineClassificationService {
|
|||||||
&& greaterOrEqualFontThanPageAverage(textBlock, page)//
|
&& greaterOrEqualFontThanPageAverage(textBlock, page)//
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9//
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9//
|
||||||
&& (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) //
|
&& (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) //
|
||||||
|
&& tableMidSentenceMatcher.reset().results()
|
||||||
|
.count() <= 1 //
|
||||||
&& !isAmount//
|
&& !isAmount//
|
||||||
&& !headlineWithSlashesMatches) {
|
&& !headlineWithSlashesMatches) {
|
||||||
|
|
||||||
|
|||||||
@ -71,6 +71,9 @@ public class TableOfContentsClassificationService {
|
|||||||
|
|
||||||
private int identifyTOCItems(int start, List<TextBlockOnPage> textBlocks, ClassificationDocument document) {
|
private int identifyTOCItems(int start, List<TextBlockOnPage> textBlocks, ClassificationDocument document) {
|
||||||
|
|
||||||
|
if (start >= textBlocks.size()) {
|
||||||
|
return start;
|
||||||
|
}
|
||||||
ClassificationPage startPage = textBlocks.get(start).page();
|
ClassificationPage startPage = textBlocks.get(start).page();
|
||||||
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
|
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
|
||||||
HashMap<NumberWord, TextBlockOnPage> numberToBlockLookup = new HashMap<>();
|
HashMap<NumberWord, TextBlockOnPage> numberToBlockLookup = new HashMap<>();
|
||||||
|
|||||||
@ -39,6 +39,7 @@ dependencies {
|
|||||||
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
|
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
|
||||||
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
|
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
|
||||||
implementation("ch.qos.logback:logback-classic")
|
implementation("ch.qos.logback:logback-classic")
|
||||||
|
api("com.iqser.red.commons:metric-commons:2.3.0")
|
||||||
|
|
||||||
implementation("com.pdftron:PDFNet:10.11.0")
|
implementation("com.pdftron:PDFNet:10.11.0")
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user