Merge branch 'feature/RED-10127-bp' into 'release/0.159.x'
RED-10127: improve list classification. See merge request fforesight/layout-parser!239
This commit is contained in:
commit
cee6c74d73
@ -4,7 +4,6 @@ import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -15,10 +14,11 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ListIdentifier {
|
||||
|
||||
public static final Pattern STARTING_NUMBERS = Pattern.compile("^\\s*([1-9]+)\\.\\s+");
|
||||
public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]+)\\.\\s+");
|
||||
public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]+)\\)\\s+");
|
||||
|
||||
enum Format {
|
||||
NUMBERS
|
||||
NUMBER_WITH_DOT, NUMBER_IN_PARENTHESES
|
||||
}
|
||||
|
||||
Format format;
|
||||
@ -45,10 +45,15 @@ public class ListIdentifier {
|
||||
sb.replace(sb.length() - 1, sb.length(), "");
|
||||
String text = sb.toString();
|
||||
|
||||
Matcher numberMatcher = STARTING_NUMBERS.matcher(text);
|
||||
Matcher numberMatcher = NUMBER_WITH_DOT.matcher(text);
|
||||
|
||||
if (numberMatcher.find()) {
|
||||
return Optional.of(new ListIdentifier(Format.NUMBERS, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1))));
|
||||
return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1))));
|
||||
}
|
||||
|
||||
Matcher parenthesisMatcher = NUMBER_IN_PARENTHESES.matcher(text);
|
||||
if (parenthesisMatcher.find()) {
|
||||
return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, Integer.parseInt(parenthesisMatcher.group(1))));
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
@ -7,7 +7,6 @@ import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.text.similarity.LevenshteinDistance;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
|
||||
@ -4,7 +4,7 @@ import java.util.regex.Pattern;
|
||||
|
||||
public class ClassificationPatterns {
|
||||
|
||||
public static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
public static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s(?:14C)?\\s*[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
|
||||
public static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
|
||||
@ -13,7 +13,7 @@ public class ClassificationPatterns {
|
||||
public static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||
|
||||
public static final Pattern AMOUNT_PATTERN = Pattern.compile(
|
||||
"^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f)\\b",
|
||||
"^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|ug|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f|ppb)\\b",
|
||||
Pattern.CASE_INSENSITIVE);
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user