Compare commits
4 Commits
main
...
release/0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c3de36cdf4 | ||
|
|
6ce62f4ad1 | ||
|
|
9297d6b83d | ||
|
|
230154a2a8 |
@@ -16,8 +16,8 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class SectionIdentifier {
|
||||
|
||||
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
|
||||
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?");
|
||||
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?");
|
||||
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?");
|
||||
|
||||
public enum Format {
|
||||
EMPTY,
|
||||
|
||||
@@ -14,11 +14,12 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ListIdentifier {
|
||||
|
||||
public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]+)\\.\\s+");
|
||||
public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]+)\\)\\s+");
|
||||
public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]{1,4})\\.\\s+");
|
||||
public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]{1,4})\\)\\s+");
|
||||
|
||||
enum Format {
|
||||
NUMBER_WITH_DOT, NUMBER_IN_PARENTHESES
|
||||
NUMBER_WITH_DOT,
|
||||
NUMBER_IN_PARENTHESES
|
||||
}
|
||||
|
||||
Format format;
|
||||
@@ -48,17 +49,33 @@ public class ListIdentifier {
|
||||
Matcher numberMatcher = NUMBER_WITH_DOT.matcher(text);
|
||||
|
||||
if (numberMatcher.find()) {
|
||||
return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, Integer.parseInt(numberMatcher.group(1))));
|
||||
Optional<Integer> representation = parseInteger(numberMatcher.group(1));
|
||||
if (representation.isPresent()) {
|
||||
return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, representation.get()));
|
||||
}
|
||||
}
|
||||
|
||||
Matcher parenthesisMatcher = NUMBER_IN_PARENTHESES.matcher(text);
|
||||
if (parenthesisMatcher.find()) {
|
||||
return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, Integer.parseInt(parenthesisMatcher.group(1))));
|
||||
Optional<Integer> representation = parseInteger(parenthesisMatcher.group(1));
|
||||
if (representation.isPresent()) {
|
||||
return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, representation.get()));
|
||||
}
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
private static Optional<Integer> parseInteger(String text) {
|
||||
|
||||
try {
|
||||
return Optional.of(Integer.parseInt(text));
|
||||
} catch (NumberFormatException e) {
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static boolean isInOrder(List<ListIdentifier> listIdentifiers) {
|
||||
|
||||
if (listIdentifiers.size() <= 1) {
|
||||
|
||||
@@ -69,6 +69,9 @@ public class TableOfContentsClassificationService {
|
||||
|
||||
private int identifyTOCItems(int start, List<TextBlockOnPage> textBlocks, ClassificationDocument document) {
|
||||
|
||||
if (start >= textBlocks.size()) {
|
||||
return start;
|
||||
}
|
||||
ClassificationPage startPage = textBlocks.get(start).page();
|
||||
List<TextBlockOnPage> initialLookAhead = textBlocks.subList(start, Math.min(start + SURROUNDING_BLOCKS_RADIUS, textBlocks.size()));
|
||||
HashMap<Word, TextBlockOnPage> lookup = new HashMap<>();
|
||||
@@ -170,7 +173,7 @@ public class TableOfContentsClassificationService {
|
||||
|
||||
Word word = sequences.get(i);
|
||||
|
||||
if (!NUMERIC.matcher(word).matches()) {
|
||||
if (!NUMERIC.matcher(word).matches() || word.length() > 5) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user