RED-10127: improve list classification

* add one more format to list identification
* add 'ppb' to known units
* special case for headlines continuing with 14C after the identifier (quite often in some specific files)
This commit is contained in:
Kilian Schuettler 2024-10-14 17:21:44 +02:00
parent d3c4413ece
commit d8394d9a78
3 changed files with 12 additions and 8 deletions

View File

@ -4,7 +4,6 @@ import java.util.List;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -15,10 +14,11 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ListIdentifier {
public static final Pattern STARTING_NUMBERS = Pattern.compile("^\\s*([1-9]+)\\.\\s+");
public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]+)\\.\\s+");
public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]+)\\)\\s+");
enum Format {
NUMBERS
NUMBER_WITH_DOT, NUMBER_IN_PARENTHESES
}
Format format;
@ -45,10 +45,15 @@ public class ListIdentifier {
sb.replace(sb.length() - 1, sb.length(), "");
String text = sb.toString();
Matcher numberMatcher = STARTING_NUMBERS.matcher(text);
Matcher numberMatcher = NUMBER_WITH_DOT.matcher(text);
if (numberMatcher.find()) {
return Optional.of(new ListIdentifier(Format.NUMBERS, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1))));
return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1))));
}
Matcher parenthesisMatcher = NUMBER_IN_PARENTHESES.matcher(text);
if (parenthesisMatcher.find()) {
return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, Integer.parseInt(parenthesisMatcher.group(1))));
}
return Optional.empty();
}

View File

@ -7,7 +7,6 @@ import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.springframework.stereotype.Service;

View File

@ -4,7 +4,7 @@ import java.util.regex.Pattern;
public class ClassificationPatterns {
public static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
public static final Pattern HEADLINE_WITH_2_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s(?:14C)?\\s*[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
public static final Pattern HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN = Pattern.compile("^([0-9]\\.)\\s[a-z][0-9a-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
@ -13,7 +13,7 @@ public class ClassificationPatterns {
public static final Pattern HEADLINE_PATTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
public static final Pattern AMOUNT_PATTERN = Pattern.compile(
"^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f)\\b",
"^\\s*\\d+(?:\\.\\d+)?\\s*(?:ml|l|ug|g|kg|mg|cm|cm2|cm3|mm|mm2|mm3|km|km2|m|m2|m3|lb|oz|ppm|dpm|days|weeks|months|%|f|ppb)\\b",
Pattern.CASE_INSENSITIVE);