diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ListIdentifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ListIdentifier.java index 0cf496b..0d55e5a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ListIdentifier.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ListIdentifier.java @@ -4,7 +4,6 @@ import java.util.List; import java.util.Optional; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.stream.Collectors; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -15,10 +14,11 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class ListIdentifier { - public static final Pattern STARTING_NUMBERS = Pattern.compile("^\\s*([1-9]+)\\.\\s+"); + public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]+)\\.\\s+"); + public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]+)\\)\\s+"); enum Format { - NUMBERS + NUMBER_WITH_DOT, NUMBER_IN_PARENTHESES } Format format; @@ -45,10 +45,15 @@ public class ListIdentifier { sb.replace(sb.length() - 1, sb.length(), ""); String text = sb.toString(); - Matcher numberMatcher = STARTING_NUMBERS.matcher(text); + Matcher numberMatcher = NUMBER_WITH_DOT.matcher(text); if (numberMatcher.find()) { - return Optional.of(new ListIdentifier(Format.NUMBERS, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1)))); + return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1)))); + } + + Matcher parenthesisMatcher = NUMBER_IN_PARENTHESES.matcher(text); + if (parenthesisMatcher.find()) { + return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, Integer.parseInt(parenthesisMatcher.group(1)))); } return Optional.empty(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java index d805c25..99ce865 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ -7,7 +7,6 @@ import java.util.List; import java.util.ListIterator; import java.util.Locale; -import org.apache.commons.lang3.StringUtils; import org.apache.commons.text.similarity.LevenshteinDistance; import org.springframework.stereotype.Service; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClassificationPatterns.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClassificationPatterns.java index 737096e..7f27b47 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClassificationPatterns.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClassificationPatterns.java @@ -4,7 +4,7 @@ import java.util.regex.Pattern; public class ClassificationPatterns { - public static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); + public static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s(?:14C)?\\s*[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); public static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); @@ -13,7 +13,7 @@ public class ClassificationPatterns { public static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); public static final Pattern AMOUNT_PATTERN = Pattern.compile( - "^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f)\\b", + "^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|ug|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f|ppb)\\b", Pattern.CASE_INSENSITIVE);