Merge branch 'RED-10270-fp' into 'main'
RED-10270: fix NumberFormatException. See merge request fforesight/layout-parser!248
This commit is contained in:
commit
b2d62e32fe
@ -16,8 +16,8 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class SectionIdentifier {
|
||||
|
||||
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
|
||||
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?");
|
||||
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?");
|
||||
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?");
|
||||
|
||||
public enum Format {
|
||||
EMPTY,
|
||||
|
||||
@ -14,11 +14,12 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ListIdentifier {
|
||||
|
||||
public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]+)\\.\\s+");
|
||||
public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]+)\\)\\s+");
|
||||
public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]{1,4})\\.\\s+");
|
||||
public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]{1,4})\\)\\s+");
|
||||
|
||||
enum Format {
|
||||
NUMBER_WITH_DOT, NUMBER_IN_PARENTHESES
|
||||
NUMBER_WITH_DOT,
|
||||
NUMBER_IN_PARENTHESES
|
||||
}
|
||||
|
||||
Format format;
|
||||
@ -48,17 +49,33 @@ public class ListIdentifier {
|
||||
Matcher numberMatcher = NUMBER_WITH_DOT.matcher(text);
|
||||
|
||||
if (numberMatcher.find()) {
|
||||
return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1))));
|
||||
Optional<Integer> representation = parseInteger(numberMatcher.group(1));
|
||||
if (representation.isPresent()) {
|
||||
return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, representation.get()));
|
||||
}
|
||||
}
|
||||
|
||||
Matcher parenthesisMatcher = NUMBER_IN_PARENTHESES.matcher(text);
|
||||
if (parenthesisMatcher.find()) {
|
||||
return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, Integer.parseInt(parenthesisMatcher.group(1))));
|
||||
Optional<Integer> representation = parseInteger(parenthesisMatcher.group(1));
|
||||
if (representation.isPresent()) {
|
||||
return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, representation.get()));
|
||||
}
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
/**
 * Attempts to parse {@code text} as an {@code int} via {@link Integer#parseInt(String)}.
 *
 * @param text the candidate number string
 * @return an {@link Optional} holding the parsed value, or {@link Optional#empty()} when
 *         {@code Integer.parseInt} throws a {@link NumberFormatException}
 */
private static Optional<Integer> parseInteger(String text) {
|
||||
|
||||
try {
|
||||
return Optional.of(Integer.parseInt(text));
|
||||
} catch (NumberFormatException e) {
|
||||
// Unparseable input is reported as "absent" so callers can skip it instead of failing.
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static boolean isInOrder(List<ListIdentifier> listIdentifiers) {
|
||||
|
||||
if (listIdentifiers.size() <= 1) {
|
||||
|
||||
@ -170,7 +170,7 @@ public class TableOfContentsClassificationService {
|
||||
|
||||
Word word = sequences.get(i);
|
||||
|
||||
if (!NUMERIC.matcher(word).matches()) {
|
||||
if (!NUMERIC.matcher(word).matches() || word.length() > 5) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user