Compare commits

...

4 Commits

Author SHA1 Message Date
Kilian Schüttler
c3de36cdf4 Merge branch 'RED-10714-bp' into 'release/0.188.x'
RED-10714: fix IndexOutOfBoundsException

See merge request fforesight/layout-parser!261
2025-01-10 12:33:34 +01:00
Kilian Schuettler
6ce62f4ad1 RED-10714: fix IndexOutOfBoundsException 2025-01-10 12:11:46 +01:00
Kilian Schüttler
9297d6b83d Merge branch 'RED-10270' into 'release/0.188.x'
RED-10270: fix NumberFormatException

See merge request fforesight/layout-parser!249
2024-10-24 17:14:50 +02:00
Kilian Schuettler
230154a2a8 RED-10270: fix NumberFormatException 2024-10-24 10:59:30 +02:00
3 changed files with 28 additions and 8 deletions

View File

@ -16,8 +16,8 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class SectionIdentifier {
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?");
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?");
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?");
public enum Format {
EMPTY,

View File

@ -14,11 +14,12 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ListIdentifier {
public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]+)\\.\\s+");
public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]+)\\)\\s+");
public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]{1,4})\\.\\s+");
public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]{1,4})\\)\\s+");
enum Format {
NUMBER_WITH_DOT, NUMBER_IN_PARENTHESES
NUMBER_WITH_DOT,
NUMBER_IN_PARENTHESES
}
Format format;
@ -48,17 +49,33 @@ public class ListIdentifier {
Matcher numberMatcher = NUMBER_WITH_DOT.matcher(text);
if (numberMatcher.find()) {
return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1))));
Optional<Integer> representation = parseInteger(numberMatcher.group(1));
if (representation.isPresent()) {
return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, representation.get()));
}
}
Matcher parenthesisMatcher = NUMBER_IN_PARENTHESES.matcher(text);
if (parenthesisMatcher.find()) {
return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, Integer.parseInt(parenthesisMatcher.group(1))));
Optional<Integer> representation = parseInteger(parenthesisMatcher.group(1));
if (representation.isPresent()) {
return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, representation.get()));
}
}
return Optional.empty();
}
private static Optional<Integer> parseInteger(String text) {
try {
return Optional.of(Integer.parseInt(text));
} catch (NumberFormatException e) {
return Optional.empty();
}
}
public static boolean isInOrder(List<ListIdentifier> listIdentifiers) {
if (listIdentifiers.size() <= 1) {

View File

@ -69,6 +69,9 @@ public class TableOfContentsClassificationService {
private int identifyTOCItems(int start, List<TextBlockOnPage> textBlocks, ClassificationDocument document) {
if (start >= textBlocks.size()) {
return start;
}
ClassificationPage startPage = textBlocks.get(start).page();
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
HashMap<Word, TextBlockOnPage> lookup = new HashMap<>();
@ -170,7 +173,7 @@ public class TableOfContentsClassificationService {
Word word = sequences.get(i);
if (!NUMERIC.matcher(word).matches()) {
if (!NUMERIC.matcher(word).matches() || word.length() > 5) {
continue;
}