Merge branch 'RED-10270-bp' into 'release/0.159.x'
RED-10270: fix NumberFormatException. See merge request fforesight/layout-parser!250
This commit is contained in:
commit
e01c0a8d3b
@ -16,8 +16,8 @@ import lombok.experimental.FieldDefaults;
|
|||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class SectionIdentifier {
|
public class SectionIdentifier {
|
||||||
|
|
||||||
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
|
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?");
|
||||||
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?");
|
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?");
|
||||||
|
|
||||||
public enum Format {
|
public enum Format {
|
||||||
EMPTY,
|
EMPTY,
|
||||||
|
|||||||
@ -14,11 +14,12 @@ import lombok.experimental.FieldDefaults;
|
|||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class ListIdentifier {
|
public class ListIdentifier {
|
||||||
|
|
||||||
public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]+)\\.\\s+");
|
public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]{1,4})\\.\\s+");
|
||||||
public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]+)\\)\\s+");
|
public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]{1,4})\\)\\s+");
|
||||||
|
|
||||||
enum Format {
|
enum Format {
|
||||||
NUMBER_WITH_DOT, NUMBER_IN_PARENTHESES
|
NUMBER_WITH_DOT,
|
||||||
|
NUMBER_IN_PARENTHESES
|
||||||
}
|
}
|
||||||
|
|
||||||
Format format;
|
Format format;
|
||||||
@ -48,17 +49,33 @@ public class ListIdentifier {
|
|||||||
Matcher numberMatcher = NUMBER_WITH_DOT.matcher(text);
|
Matcher numberMatcher = NUMBER_WITH_DOT.matcher(text);
|
||||||
|
|
||||||
if (numberMatcher.find()) {
|
if (numberMatcher.find()) {
|
||||||
return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1))));
|
Optional<Integer> representation = parseInteger(numberMatcher.group(1));
|
||||||
|
if (representation.isPresent()) {
|
||||||
|
return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, representation.get()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Matcher parenthesisMatcher = NUMBER_IN_PARENTHESES.matcher(text);
|
Matcher parenthesisMatcher = NUMBER_IN_PARENTHESES.matcher(text);
|
||||||
if (parenthesisMatcher.find()) {
|
if (parenthesisMatcher.find()) {
|
||||||
return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, Integer.parseInt(parenthesisMatcher.group(1))));
|
Optional<Integer> representation = parseInteger(parenthesisMatcher.group(1));
|
||||||
|
if (representation.isPresent()) {
|
||||||
|
return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, representation.get()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static Optional<Integer> parseInteger(String text) {
|
||||||
|
|
||||||
|
try {
|
||||||
|
return Optional.of(Integer.parseInt(text));
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public static boolean isInOrder(List<ListIdentifier> listIdentifiers) {
|
public static boolean isInOrder(List<ListIdentifier> listIdentifiers) {
|
||||||
|
|
||||||
if (listIdentifiers.size() <= 1) {
|
if (listIdentifiers.size() <= 1) {
|
||||||
|
|||||||
@ -170,7 +170,7 @@ public class TableOfContentsClassificationService {
|
|||||||
|
|
||||||
Word word = sequences.get(i);
|
Word word = sequences.get(i);
|
||||||
|
|
||||||
if (!NUMERIC.matcher(word).matches()) {
|
if (!NUMERIC.matcher(word).matches() || word.length() > 5) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user