Merge branch 'feature/RED-10127-bp' into 'release/0.159.x'

RED-10127: improve list classification

See merge request fforesight/layout-parser!239
This commit is contained in:
Kilian Schüttler 2024-10-14 17:31:24 +02:00
commit cee6c74d73
3 changed files with 12 additions and 8 deletions

View File

@ -4,7 +4,6 @@ import java.util.List;
import java.util.Optional; import java.util.Optional;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors;
import lombok.AccessLevel; import lombok.AccessLevel;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
@ -15,10 +14,11 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ListIdentifier { public class ListIdentifier {
public static final Pattern STARTING_NUMBERS = Pattern.compile("^\\s*([1-9]+)\\.\\s+"); public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]+)\\.\\s+");
public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]+)\\)\\s+");
enum Format { enum Format {
NUMBERS NUMBER_WITH_DOT, NUMBER_IN_PARENTHESES
} }
Format format; Format format;
@ -45,10 +45,15 @@ public class ListIdentifier {
sb.replace(sb.length() - 1, sb.length(), ""); sb.replace(sb.length() - 1, sb.length(), "");
String text = sb.toString(); String text = sb.toString();
Matcher numberMatcher = STARTING_NUMBERS.matcher(text); Matcher numberMatcher = NUMBER_WITH_DOT.matcher(text);
if (numberMatcher.find()) { if (numberMatcher.find()) {
return Optional.of(new ListIdentifier(Format.NUMBERS, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1)))); return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1))));
}
Matcher parenthesisMatcher = NUMBER_IN_PARENTHESES.matcher(text);
if (parenthesisMatcher.find()) {
return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, Integer.parseInt(parenthesisMatcher.group(1))));
} }
return Optional.empty(); return Optional.empty();
} }

View File

@ -7,7 +7,6 @@ import java.util.List;
import java.util.ListIterator; import java.util.ListIterator;
import java.util.Locale; import java.util.Locale;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.LevenshteinDistance; import org.apache.commons.text.similarity.LevenshteinDistance;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;

View File

@ -4,7 +4,7 @@ import java.util.regex.Pattern;
public class ClassificationPatterns { public class ClassificationPatterns {
public static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); public static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s(?:14C)?\\s*[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
public static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); public static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
@ -13,7 +13,7 @@ public class ClassificationPatterns {
public static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); public static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
public static final Pattern AMOUNT_PATTERN = Pattern.compile( public static final Pattern AMOUNT_PATTERN = Pattern.compile(
"^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f)\\b", "^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|ug|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f|ppb)\\b",
Pattern.CASE_INSENSITIVE); Pattern.CASE_INSENSITIVE);