RED-10127: fix typos and extend dot comparison
This commit is contained in:
parent
4b0c041d84
commit
e7dbda813a
@ -33,8 +33,6 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
private boolean underlined;
|
private boolean underlined;
|
||||||
|
|
||||||
private double highestFontSize;
|
|
||||||
|
|
||||||
private PageBlockType classification;
|
private PageBlockType classification;
|
||||||
|
|
||||||
private boolean toDuplicate;
|
private boolean toDuplicate;
|
||||||
@ -262,6 +260,12 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Returns the value reported by {@code getHighest()} of this block's font-size frequency counter.
 * NOTE(review): presumably the largest font size seen in this block - confirm against the frequency counter implementation.
 */
public double getHighestFontSize() {
|
||||||
|
|
||||||
|
return frequencyCounters.getFontSizeFrequencyCounter().getHighest();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isEmpty() {
|
public boolean isEmpty() {
|
||||||
|
|
||||||
|
|||||||
@ -57,11 +57,13 @@ public class TableOfContentsClassificationService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
int offset = identifyTOCItems(i + 1, textBlocks, document);
|
int end = identifyTOCItems(i + 1, textBlocks, document);
|
||||||
|
|
||||||
if (offset > 1) {
|
if (end > i + 1) {
|
||||||
textBlock.textBlock().setClassification(PageBlockType.H1);
|
if (textBlock.textBlock().getClassification() == null) {
|
||||||
i += offset;
|
textBlock.textBlock().setClassification(PageBlockType.H1);
|
||||||
|
}
|
||||||
|
i = end;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -352,7 +354,7 @@ public class TableOfContentsClassificationService {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
int prev = getNumberAsInt(numbers, i);
|
int prev = getNumberAsInt(numbers, i - 1);
|
||||||
int curr = getNumberAsInt(numbers, i);
|
int curr = getNumberAsInt(numbers, i);
|
||||||
int next = getNumberAsInt(numbers, i + 1);
|
int next = getNumberAsInt(numbers, i + 1);
|
||||||
|
|
||||||
|
|||||||
@ -5,6 +5,7 @@ import java.awt.geom.Point2D;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||||
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
||||||
@ -50,6 +51,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@Slf4j
|
@Slf4j
|
||||||
public class PDFLinesTextStripper extends PDFTextStripper {
|
public class PDFLinesTextStripper extends PDFTextStripper {
|
||||||
|
|
||||||
|
// Strings treated as a "dot" when detecting dotted leader lines.
// Every entry is written as a Unicode escape so visually identical characters stay distinct code points:
// Set.of rejects duplicate elements with an IllegalArgumentException, which would break class initialisation.
// NOTE(review): U+FF0E and U+FF65 are presumed to be the intended look-alikes of '.' and U+30FB - confirm against the original commit.
private static final Set<String> DOT_LIKE_CHARACTERS = Set.of("\u002E", // full stop
        "\u00B7", // middle dot
        "\u2022", // bullet
        "\u2024", // one dot leader
        "\u2027", // hyphenation point
        "\u2219", // bullet operator
        "\u22C5", // dot operator
        "\u30FB", // katakana middle dot
        "\uFF0E", // fullwidth full stop
        "\uFF65", // halfwidth katakana middle dot
        "\u2026", // horizontal ellipsis
        "\u2E31", // word separator middle dot
        "\uFE52", // small full stop
        "\uA78F"); // latin letter sinological dot
|
||||||
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
private final List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||||
private final List<Ruling> rulings = new ArrayList<>();
|
private final List<Ruling> rulings = new ArrayList<>();
|
||||||
private final List<Ruling> graphicsPath = new ArrayList<>();
|
private final List<Ruling> graphicsPath = new ArrayList<>();
|
||||||
@ -336,20 +338,32 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
private boolean isWordFollowedByDottedLine(List<TextPosition> textPositions, int i, int startIndex) {
|
private boolean isWordFollowedByDottedLine(List<TextPosition> textPositions, int i, int startIndex) {
|
||||||
|
|
||||||
return i - startIndex >= 4 //
|
return i - startIndex >= 4 //
|
||||||
&& textPositions.get(i).getUnicode().equals(".") //
|
&& isDot(textPositions, i) //
|
||||||
&& textPositions.get(i - 1).getUnicode().equals(".") //
|
&& isDot(textPositions, i - 1) //
|
||||||
&& textPositions.get(i - 2).getUnicode().equals(".") //
|
&& isDot(textPositions, i - 2) //
|
||||||
&& !textPositions.get(i - 3).getUnicode().equals(".");
|
&& alphanumeric(textPositions, i - 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static boolean isDottedLineFollowedByWord(List<TextPosition> textPositions, int i, int startIndex) {
|
private static boolean isDottedLineFollowedByWord(List<TextPosition> textPositions, int i, int startIndex) {
|
||||||
|
|
||||||
return i - startIndex >= 4 //
|
return i - startIndex >= 4 //
|
||||||
&& !textPositions.get(i).getUnicode().equals(".") //
|
&& alphanumeric(textPositions, i) //
|
||||||
&& textPositions.get(i - 1).getUnicode().equals(".") //
|
&& isDot(textPositions, i - 1) //
|
||||||
&& textPositions.get(i - 2).getUnicode().equals(".") //
|
&& isDot(textPositions, i - 2) //
|
||||||
&& textPositions.get(i - 3).getUnicode().equals(".");
|
&& isDot(textPositions, i - 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Checks whether the Unicode string of the text position at index {@code i} is one of {@code DOT_LIKE_CHARACTERS}.
 * The comparison is on the whole string returned by {@code getUnicode()}, not on its first character.
 *
 * @param textPositions the text positions to look into
 * @param i             index of the text position to test; must be a valid index of {@code textPositions}
 * @return {@code true} if the text position's Unicode string is contained in {@code DOT_LIKE_CHARACTERS}
 */
private static boolean isDot(List<TextPosition> textPositions, int i) {
|
||||||
|
|
||||||
|
return DOT_LIKE_CHARACTERS.contains(textPositions.get(i).getUnicode());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean alphanumeric(List<TextPosition> textPositions, int i) {
|
||||||
|
|
||||||
|
return Character.isAlphabetic(textPositions.get(i).getUnicode().charAt(0)) || Character.isDigit(textPositions.get(i).getUnicode().charAt(0));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -37,7 +37,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
|||||||
@Disabled
|
@Disabled
|
||||||
public void testLayoutParserEndToEnd() {
|
public void testLayoutParserEndToEnd() {
|
||||||
|
|
||||||
String filePath = "/home/kschuettler/Dokumente/TestFiles/NER Dataset/Syngenta prod/77c680315c31d403d2e023be023b2087.PREVIEW.pdf";
|
String filePath = "/home/kschuettler/Dokumente/LayoutparsingEvaluation/613c12dd5c14851cb37e413eb56a7a7b.UNTOUCHED.pdf";
|
||||||
|
|
||||||
runForFile(filePath);
|
runForFile(filePath);
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user