From 5ef5d5509bdafc72653f3dcea17b77bb8859a534 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kilian=20Sch=C3=BCttler?= Date: Thu, 24 Oct 2024 17:14:51 +0200 Subject: [PATCH] RED-10270: fix NumberFormatException --- .../processor/model/SectionIdentifier.java | 4 +-- .../processor/model/text/ListIdentifier.java | 27 +++++++++++++++---- .../TableOfContentsClassificationService.java | 2 +- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java index e6af0f8..9759695 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java @@ -16,8 +16,8 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class SectionIdentifier { - public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?"); - public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?"); + public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?"); + public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?"); public enum Format { EMPTY, diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ListIdentifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ListIdentifier.java index e64a39e..7bd5b93 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ListIdentifier.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ListIdentifier.java @@ -14,11 +14,12 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class ListIdentifier { - public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]+)\\.\\s+"); - public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]+)\\)\\s+"); + public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]{1,4})\\.\\s+"); + public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]{1,4})\\)\\s+"); enum Format { - NUMBER_WITH_DOT, NUMBER_IN_PARENTHESES + NUMBER_WITH_DOT, + NUMBER_IN_PARENTHESES } Format format; @@ -48,17 +49,33 @@ public class ListIdentifier { Matcher numberMatcher = NUMBER_WITH_DOT.matcher(text); if (numberMatcher.find()) { - return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1)))); + Optional representation = parseInteger(numberMatcher.group(1)); + if (representation.isPresent()) { + return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, representation.get())); + } } Matcher parenthesisMatcher = NUMBER_IN_PARENTHESES.matcher(text); if (parenthesisMatcher.find()) { - return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, Integer.parseInt(parenthesisMatcher.group(1)))); + Optional representation = parseInteger(parenthesisMatcher.group(1)); + if (representation.isPresent()) { + return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, representation.get())); + } } return Optional.empty(); } + private static Optional parseInteger(String text) { + + try { + return Optional.of(Integer.parseInt(text)); + } catch (NumberFormatException e) { + return Optional.empty(); + } + } + + public static boolean isInOrder(List listIdentifiers) { if (listIdentifiers.size() <= 1) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java index 2465632..c04f7d7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java @@ -170,7 +170,7 @@ public class TableOfContentsClassificationService { Word word = sequences.get(i); - if (!NUMERIC.matcher(word).matches()) { + if (!NUMERIC.matcher(word).matches() || word.length() > 5) { continue; }