RED-8825: general layout parsing improvements
* added test for table line classification
This commit is contained in:
parent
60acbac53f
commit
bcd1eb9afa
@ -0,0 +1,84 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class RulingsClassifierTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void textRulingExtractionTest() {
|
||||
|
||||
String fileName = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
|
||||
for (PageContents pageContent : pageContents) {
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
|
||||
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings);
|
||||
|
||||
assertTrue(pageContent.getSortedTextPositionSequences()
|
||||
.stream()
|
||||
.filter(word -> word.toString().equals("Underlined"))
|
||||
.allMatch(TextPositionSequence::isUnderline));
|
||||
assertTrue(pageContent.getSortedTextPositionSequences()
|
||||
.stream()
|
||||
.filter(word -> word.toString().equals("Striketrough"))
|
||||
.allMatch(TextPositionSequence::isStrikethrough));
|
||||
|
||||
assertEquals(4,
|
||||
cleanRulings.buildAll()
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.STRIKETROUGH))
|
||||
.count());
|
||||
assertEquals(4,
|
||||
cleanRulings.buildAll()
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.UNDERLINE))
|
||||
.count());
|
||||
assertEquals(0, cleanRulings.withoutTextRulings().buildAll().size());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void tableRulingExtractionTest() {
|
||||
|
||||
String fileName = "files/SinglePages/AbsolutelyEnormousTable.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
|
||||
for (PageContents pageContent : pageContents) {
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
|
||||
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings);
|
||||
|
||||
assertEquals(30, cleanRulings.getHorizontals().size());
|
||||
assertEquals(30, cleanRulings.getTableLines().getHorizontals().size());
|
||||
|
||||
assertEquals(144, cleanRulings.getVerticals().size());
|
||||
assertEquals(144, cleanRulings.getTableLines().getVerticals().size());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,46 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class TextRulingsClassifierTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void textRulingExtractionTest() {
|
||||
|
||||
String fileName = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
|
||||
for (PageContents pageContent : pageContents) {
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
|
||||
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings);
|
||||
|
||||
assertTrue(pageContent.getSortedTextPositionSequences().stream().filter(word -> word.toString().equals("Underlined")).allMatch(TextPositionSequence::isUnderline));
|
||||
assertTrue(pageContent.getSortedTextPositionSequences().stream().filter(word -> word.toString().equals("Striketrough")).allMatch(TextPositionSequence::isStrikethrough));
|
||||
|
||||
assertEquals(4, cleanRulings.buildAll().stream().filter(ruling -> ruling.getClassification().equals(Ruling.Classification.STRIKETROUGH)).count());
|
||||
assertEquals(4, cleanRulings.buildAll().stream().filter(ruling -> ruling.getClassification().equals(Ruling.Classification.UNDERLINE)).count());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user