RED-8825: general layoutparsing improvements
* added test for table line classification
This commit is contained in:
parent
60acbac53f
commit
bcd1eb9afa
@ -0,0 +1,84 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
|
public class RulingsClassifierTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void textRulingExtractionTest() {
|
||||||
|
|
||||||
|
String fileName = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
|
||||||
|
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||||
|
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||||
|
|
||||||
|
for (PageContents pageContent : pageContents) {
|
||||||
|
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
|
||||||
|
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||||
|
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings);
|
||||||
|
|
||||||
|
assertTrue(pageContent.getSortedTextPositionSequences()
|
||||||
|
.stream()
|
||||||
|
.filter(word -> word.toString().equals("Underlined"))
|
||||||
|
.allMatch(TextPositionSequence::isUnderline));
|
||||||
|
assertTrue(pageContent.getSortedTextPositionSequences()
|
||||||
|
.stream()
|
||||||
|
.filter(word -> word.toString().equals("Striketrough"))
|
||||||
|
.allMatch(TextPositionSequence::isStrikethrough));
|
||||||
|
|
||||||
|
assertEquals(4,
|
||||||
|
cleanRulings.buildAll()
|
||||||
|
.stream()
|
||||||
|
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.STRIKETROUGH))
|
||||||
|
.count());
|
||||||
|
assertEquals(4,
|
||||||
|
cleanRulings.buildAll()
|
||||||
|
.stream()
|
||||||
|
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.UNDERLINE))
|
||||||
|
.count());
|
||||||
|
assertEquals(0, cleanRulings.withoutTextRulings().buildAll().size());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void tableRulingExtractionTest() {
|
||||||
|
|
||||||
|
String fileName = "files/SinglePages/AbsolutelyEnormousTable.pdf";
|
||||||
|
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||||
|
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||||
|
|
||||||
|
for (PageContents pageContent : pageContents) {
|
||||||
|
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
|
||||||
|
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||||
|
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings);
|
||||||
|
|
||||||
|
assertEquals(30, cleanRulings.getHorizontals().size());
|
||||||
|
assertEquals(30, cleanRulings.getTableLines().getHorizontals().size());
|
||||||
|
|
||||||
|
assertEquals(144, cleanRulings.getVerticals().size());
|
||||||
|
assertEquals(144, cleanRulings.getTableLines().getVerticals().size());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,46 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
|
||||||
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
|
|
||||||
public class TextRulingsClassifierTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@SneakyThrows
|
|
||||||
public void textRulingExtractionTest() {
|
|
||||||
|
|
||||||
String fileName = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
|
|
||||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
|
||||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
|
||||||
|
|
||||||
for (PageContents pageContent : pageContents) {
|
|
||||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
|
|
||||||
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
|
||||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings);
|
|
||||||
|
|
||||||
assertTrue(pageContent.getSortedTextPositionSequences().stream().filter(word -> word.toString().equals("Underlined")).allMatch(TextPositionSequence::isUnderline));
|
|
||||||
assertTrue(pageContent.getSortedTextPositionSequences().stream().filter(word -> word.toString().equals("Striketrough")).allMatch(TextPositionSequence::isStrikethrough));
|
|
||||||
|
|
||||||
assertEquals(4, cleanRulings.buildAll().stream().filter(ruling -> ruling.getClassification().equals(Ruling.Classification.STRIKETROUGH)).count());
|
|
||||||
assertEquals(4, cleanRulings.buildAll().stream().filter(ruling -> ruling.getClassification().equals(Ruling.Classification.UNDERLINE)).count());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
Loading…
x
Reference in New Issue
Block a user