Merge branch 'RED-10270-fp' into 'main'

RED-10270: fix NumberFormatException

See merge request fforesight/layout-parser!248
This commit is contained in:
Kilian Schüttler 2024-10-24 17:14:47 +02:00
commit b2d62e32fe
3 changed files with 25 additions and 8 deletions

View File

@ -16,8 +16,8 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class SectionIdentifier {
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?");
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?");
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?");
public enum Format {
EMPTY,

View File

@ -14,11 +14,12 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ListIdentifier {
public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]+)\\.\\s+");
public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]+)\\)\\s+");
public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]{1,4})\\.\\s+");
public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]{1,4})\\)\\s+");
enum Format {
NUMBER_WITH_DOT, NUMBER_IN_PARENTHESES
NUMBER_WITH_DOT,
NUMBER_IN_PARENTHESES
}
Format format;
@ -48,17 +49,33 @@ public class ListIdentifier {
Matcher numberMatcher = NUMBER_WITH_DOT.matcher(text);
if (numberMatcher.find()) {
return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1))));
Optional<Integer> representation = parseInteger(numberMatcher.group(1));
if (representation.isPresent()) {
return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, representation.get()));
}
}
Matcher parenthesisMatcher = NUMBER_IN_PARENTHESES.matcher(text);
if (parenthesisMatcher.find()) {
return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, Integer.parseInt(parenthesisMatcher.group(1))));
Optional<Integer> representation = parseInteger(parenthesisMatcher.group(1));
if (representation.isPresent()) {
return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, representation.get()));
}
}
return Optional.empty();
}
/**
 * Converts the given text to an integer without letting a parse failure escape.
 *
 * @param text the candidate string to convert
 * @return an Optional holding the parsed value, or an empty Optional when
 *         {@link Integer#parseInt(String)} rejects the text with a NumberFormatException
 */
private static Optional<Integer> parseInteger(String text) {
    Optional<Integer> parsed;
    try {
        int value = Integer.parseInt(text);
        parsed = Optional.of(value);
    } catch (NumberFormatException ignored) {
        // Unparseable text is reported to the caller as "no value" instead of an exception.
        parsed = Optional.empty();
    }
    return parsed;
}
public static boolean isInOrder(List<ListIdentifier> listIdentifiers) {
if (listIdentifiers.size() <= 1) {

View File

@ -170,7 +170,7 @@ public class TableOfContentsClassificationService {
Word word = sequences.get(i);
if (!NUMERIC.matcher(word).matches()) {
if (!NUMERIC.matcher(word).matches() || word.length() > 5) {
continue;
}