RED-10127: fix typos and extend dot comparison
This commit is contained in:
parent
4b0c041d84
commit
e7dbda813a
@ -33,8 +33,6 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
private boolean underlined;
|
||||
|
||||
private double highestFontSize;
|
||||
|
||||
private PageBlockType classification;
|
||||
|
||||
private boolean toDuplicate;
|
||||
@ -262,6 +260,12 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
public double getHighestFontSize() {
|
||||
|
||||
return frequencyCounters.getFontSizeFrequencyCounter().getHighest();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isEmpty() {
|
||||
|
||||
|
||||
@ -57,11 +57,13 @@ public class TableOfContentsClassificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
int offset = identifyTOCItems(i + 1, textBlocks, document);
|
||||
int end = identifyTOCItems(i + 1, textBlocks, document);
|
||||
|
||||
if (offset > 1) {
|
||||
textBlock.textBlock().setClassification(PageBlockType.H1);
|
||||
i += offset;
|
||||
if (end > i + 1) {
|
||||
if (textBlock.textBlock().getClassification() == null) {
|
||||
textBlock.textBlock().setClassification(PageBlockType.H1);
|
||||
}
|
||||
i = end;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -352,7 +354,7 @@ public class TableOfContentsClassificationService {
|
||||
return false;
|
||||
}
|
||||
|
||||
int prev = getNumberAsInt(numbers, i);
|
||||
int prev = getNumberAsInt(numbers, i - 1);
|
||||
int curr = getNumberAsInt(numbers, i);
|
||||
int next = getNumberAsInt(numbers, i + 1);
|
||||
|
||||
|
||||
@ -5,6 +5,7 @@ import java.awt.geom.Point2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
||||
@ -50,6 +51,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
/**
 * Single-character strings that are accepted as a "dot" by {@code isDot}.
 * <p>
 * {@link Set#of(Object[])} rejects duplicate elements with an {@link IllegalArgumentException}, which
 * would surface as a class initialization failure. Characters that are easy to confuse with an entry
 * already present are therefore spelled as Unicode escapes, so every element stays distinct no matter
 * how the source file is encoded or normalized.
 */
private static final Set<String> DOT_LIKE_CHARACTERS = Set.of(".",
        "·",
        "•",
        "․",
        "‧",
        "∙",
        "⋅",
        "・",
        // NOTE(review): fullwidth full stop; presumably the intended variant of the second "." entry - confirm.
        "\uFF0E",
        // NOTE(review): halfwidth katakana middle dot; presumably the intended variant of the second "・" entry - confirm.
        "\uFF65",
        "…",
        "⸱",
        "﹒",
        "ꞏ");
|
||||
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
private final List<Ruling> rulings = new ArrayList<>();
|
||||
private final List<Ruling> graphicsPath = new ArrayList<>();
|
||||
@ -336,20 +338,32 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
/**
 * Checks whether the text position at index {@code i} is the third of three consecutive dots that
 * come directly after a word character, i.e. positions {@code i - 2 .. i} are dots and position
 * {@code i - 3} is not part of the dotted line.
 * <p>
 * The check is only attempted when {@code i - startIndex >= 4}, so index {@code i - 3} is always
 * greater than {@code startIndex}.
 * <p>
 * NOTE(review): this listing contains two condition chains. One compares {@code getUnicode()} with
 * a literal {@code "."} and only requires position {@code i - 3} to be a non-dot; the other uses
 * {@code isDot} and requires position {@code i - 3} to satisfy {@code alphanumeric}. Presumably the
 * literal comparisons are the superseded lines of the change - confirm against the actual file.
 *
 * @param textPositions the text positions to inspect
 * @param i             index of the position expected to be the last of the three dots
 * @param startIndex    lower bound used for the distance check; presumably the start of the
 *                      current sequence - confirm against the caller
 * @return {@code true} if the pattern "word character followed by three dots" ends at {@code i}
 */
private boolean isWordFollowedByDottedLine(List<TextPosition> textPositions, int i, int startIndex) {
|
||||
|
||||
return i - startIndex >= 4 //
|
||||
&& textPositions.get(i).getUnicode().equals(".") //
|
||||
&& textPositions.get(i - 1).getUnicode().equals(".") //
|
||||
&& textPositions.get(i - 2).getUnicode().equals(".") //
|
||||
&& !textPositions.get(i - 3).getUnicode().equals(".");
|
||||
&& isDot(textPositions, i) //
|
||||
&& isDot(textPositions, i - 1) //
|
||||
&& isDot(textPositions, i - 2) //
|
||||
&& alphanumeric(textPositions, i - 3);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Checks whether the text position at index {@code i} is the first word character directly after
 * three consecutive dots, i.e. positions {@code i - 3 .. i - 1} are dots and position {@code i}
 * is not part of the dotted line.
 * <p>
 * The check is only attempted when {@code i - startIndex >= 4}, so index {@code i - 3} is always
 * greater than {@code startIndex}.
 * <p>
 * NOTE(review): this listing contains two condition chains. One compares {@code getUnicode()} with
 * a literal {@code "."} and only requires position {@code i} to be a non-dot; the other uses
 * {@code isDot} and requires position {@code i} to satisfy {@code alphanumeric}. Presumably the
 * literal comparisons are the superseded lines of the change - confirm against the actual file.
 *
 * @param textPositions the text positions to inspect
 * @param i             index of the position expected to follow the three dots
 * @param startIndex    lower bound used for the distance check; presumably the start of the
 *                      current sequence - confirm against the caller
 * @return {@code true} if the pattern "three dots followed by a word character" ends at {@code i}
 */
private static boolean isDottedLineFollowedByWord(List<TextPosition> textPositions, int i, int startIndex) {
|
||||
|
||||
return i - startIndex >= 4 //
|
||||
&& !textPositions.get(i).getUnicode().equals(".") //
|
||||
&& textPositions.get(i - 1).getUnicode().equals(".") //
|
||||
&& textPositions.get(i - 2).getUnicode().equals(".") //
|
||||
&& textPositions.get(i - 3).getUnicode().equals(".");
|
||||
&& alphanumeric(textPositions, i) //
|
||||
&& isDot(textPositions, i - 1) //
|
||||
&& isDot(textPositions, i - 2) //
|
||||
&& isDot(textPositions, i - 3);
|
||||
}
|
||||
|
||||
|
||||
private static boolean isDot(List<TextPosition> textPositions, int i) {
|
||||
|
||||
return DOT_LIKE_CHARACTERS.contains(textPositions.get(i).getUnicode());
|
||||
}
|
||||
|
||||
|
||||
private static boolean alphanumeric(List<TextPosition> textPositions, int i) {
|
||||
|
||||
return Character.isAlphabetic(textPositions.get(i).getUnicode().charAt(0)) || Character.isDigit(textPositions.get(i).getUnicode().charAt(0));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -37,7 +37,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@Disabled
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
String filePath = "/home/kschuettler/Dokumente/TestFiles/NER Dataset/Syngenta prod/77c680315c31d403d2e023be023b2087.PREVIEW.pdf";
|
||||
String filePath = "/home/kschuettler/Dokumente/LayoutparsingEvaluation/613c12dd5c14851cb37e413eb56a7a7b.UNTOUCHED.pdf";
|
||||
|
||||
runForFile(filePath);
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user