RED-8825: general layout parsing improvements

* added test for table line classification
This commit is contained in:
Kilian Schuettler 2024-05-03 00:13:48 +02:00
parent 60acbac53f
commit bcd1eb9afa
2 changed files with 84 additions and 46 deletions

View File

@ -0,0 +1,84 @@
package com.knecon.fforesight.service.layoutparser.server.services;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Collections;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
import lombok.SneakyThrows;
/**
 * Exercises the ruling cleaning and text-ruling classification on two sample PDFs: one with
 * underlined and struck-through words, one with a large table.
 */
public class RulingsClassifierTest {

    private static final String TEXT_RULINGS_FILE = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
    private static final String TABLE_FILE = "files/SinglePages/AbsolutelyEnormousTable.pdf";


    /**
     * For every page of the text-rulings sample, asserts that words reading "Underlined" are flagged
     * as underlined, words reading "Striketrough" are flagged as strikethrough, that exactly four
     * rulings carry each of the two classifications, and that {@code withoutTextRulings()} yields no
     * rulings at all.
     */
    @Test
    @SneakyThrows
    public void textRulingExtractionTest() {

        List<PageContents> pages = PageContentExtractor.getSortedPageContents(TEXT_RULINGS_FILE);
        RulingCleaningService cleaner = new RulingCleaningService();

        for (PageContents page : pages) {
            CleanRulings rulings = cleanAndClassify(cleaner, page);

            // "No matching word lacks the flag" is the same statement as "all matching words carry it";
            // like the latter, it holds vacuously when the page contains no such word.
            boolean underlinedWordsFlagged = page.getSortedTextPositionSequences()
                    .stream()
                    .noneMatch(word -> word.toString().equals("Underlined") && !word.isUnderline());
            assertTrue(underlinedWordsFlagged);

            boolean struckWordsFlagged = page.getSortedTextPositionSequences()
                    .stream()
                    .noneMatch(word -> word.toString().equals("Striketrough") && !word.isStrikethrough());
            assertTrue(struckWordsFlagged);

            assertEquals(4, countClassifiedAs(rulings, Ruling.Classification.STRIKETROUGH));
            assertEquals(4, countClassifiedAs(rulings, Ruling.Classification.UNDERLINE));
            assertEquals(0, rulings.withoutTextRulings().buildAll().size());
        }
    }


    /**
     * For every page of the table sample, asserts the expected number of horizontal (30) and
     * vertical (144) rulings, and that {@code getTableLines()} reports the very same counts.
     */
    @Test
    @SneakyThrows
    public void tableRulingExtractionTest() {

        List<PageContents> pages = PageContentExtractor.getSortedPageContents(TABLE_FILE);
        RulingCleaningService cleaner = new RulingCleaningService();

        for (PageContents page : pages) {
            CleanRulings rulings = cleanAndClassify(cleaner, page);

            assertEquals(30, rulings.getHorizontals().size());
            assertEquals(30, rulings.getTableLines().getHorizontals().size());
            assertEquals(144, rulings.getVerticals().size());
            assertEquals(144, rulings.getTableLines().getVerticals().size());
        }
    }


    /**
     * Runs the pipeline shared by both tests on a single page: deduplicates and straightens the
     * page's rulings (passing an empty list as the first argument), invokes the rectangular
     * intersection finder on the result (its return value is not used), and finally classifies
     * underlined and strikethrough text against the cleaned rulings.
     *
     * @param cleaner the cleaning service to use
     * @param page    the page whose rulings and text are processed
     * @return the cleaned rulings after classification has run
     */
    @SneakyThrows
    private static CleanRulings cleanAndClassify(RulingCleaningService cleaner, PageContents page) {

        CleanRulings rulings = cleaner.deduplicateAndStraightenRulings(Collections.emptyList(), page.getRulings());
        RectangularIntersectionFinder.find(rulings.getHorizontals(), rulings.getVerticals());
        TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(page.getSortedTextPositionSequences(), rulings);
        return rulings;
    }


    /**
     * Counts the rulings returned by {@code buildAll()} whose classification equals the given one.
     *
     * @param rulings        the cleaned rulings to inspect
     * @param classification the classification to look for
     * @return the number of matching rulings
     */
    private static long countClassifiedAs(CleanRulings rulings, Ruling.Classification classification) {

        return rulings.buildAll()
                .stream()
                .filter(ruling -> ruling.getClassification().equals(classification))
                .count();
    }

}

View File

@ -1,46 +0,0 @@
package com.knecon.fforesight.service.layoutparser.server.services;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Collections;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
import lombok.SneakyThrows;
/**
 * Exercises the classification of rulings that underline or strike through text, using a sample
 * PDF containing such words.
 */
public class TextRulingsClassifierTest {

    /**
     * For every page of the sample, asserts that words reading "Underlined" are flagged as
     * underlined, words reading "Striketrough" are flagged as strikethrough, and that exactly four
     * rulings carry each of the two classifications.
     */
    @Test
    @SneakyThrows
    public void textRulingExtractionTest() {

        String samplePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
        List<PageContents> pages = PageContentExtractor.getSortedPageContents(samplePath);
        RulingCleaningService cleaner = new RulingCleaningService();

        for (PageContents page : pages) {
            CleanRulings rulings = cleaner.deduplicateAndStraightenRulings(Collections.emptyList(), page.getRulings());
            // The finder's return value is not used here.
            RectangularIntersectionFinder.find(rulings.getHorizontals(), rulings.getVerticals());
            TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(page.getSortedTextPositionSequences(), rulings);

            // "No matching word lacks the flag" is the same statement as "all matching words carry it";
            // like the latter, it holds vacuously when the page contains no such word.
            boolean underlinedWordsFlagged = page.getSortedTextPositionSequences()
                    .stream()
                    .noneMatch(word -> word.toString().equals("Underlined") && !word.isUnderline());
            assertTrue(underlinedWordsFlagged);

            boolean struckWordsFlagged = page.getSortedTextPositionSequences()
                    .stream()
                    .noneMatch(word -> word.toString().equals("Striketrough") && !word.isStrikethrough());
            assertTrue(struckWordsFlagged);

            assertEquals(4, countClassifiedAs(rulings, Ruling.Classification.STRIKETROUGH));
            assertEquals(4, countClassifiedAs(rulings, Ruling.Classification.UNDERLINE));
        }
    }


    /**
     * Counts the rulings returned by {@code buildAll()} whose classification equals the given one.
     *
     * @param rulings        the cleaned rulings to inspect
     * @param classification the classification to look for
     * @return the number of matching rulings
     */
    private static long countClassifiedAs(CleanRulings rulings, Ruling.Classification classification) {

        return rulings.buildAll()
                .stream()
                .filter(ruling -> ruling.getClassification().equals(classification))
                .count();
    }

}