RED-10127: add list classification
* refactor headline font sizes
* remove title case
* no real drawbacks, mostly edge cases
* +0.1% F1 Score (2 Files with +8%)
This commit is contained in:
parent
28d79902ff
commit
e9b406af16
@ -374,14 +374,7 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
classificationService.classify(classificationDocument, layoutParsingType, identifier);
|
classificationService.classify(classificationDocument, layoutParsingType, identifier);
|
||||||
|
|
||||||
List<TextPageBlock> headlines = classificationDocument.getPages()
|
TableOfContents tableOfContents = outlineValidationService.createToC(classificationDocument);
|
||||||
.stream()
|
|
||||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
|
||||||
.stream()
|
|
||||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
|
|
||||||
.map(tb -> (TextPageBlock) tb))
|
|
||||||
.toList();
|
|
||||||
TableOfContents tableOfContents = outlineValidationService.createToC(headlines);
|
|
||||||
classificationDocument.setTableOfContents(tableOfContents);
|
classificationDocument.setTableOfContents(tableOfContents);
|
||||||
|
|
||||||
log.info("Building Sections for {}", identifier);
|
log.info("Building Sections for {}", identifier);
|
||||||
|
|||||||
@ -10,6 +10,7 @@ import java.util.TreeSet;
|
|||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
|
||||||
import io.micrometer.observation.annotation.Observed;
|
import io.micrometer.observation.annotation.Observed;
|
||||||
@ -20,7 +21,9 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
public class OutlineValidationService {
|
public class OutlineValidationService {
|
||||||
|
|
||||||
@Observed(name = "OutlineValidationService", contextualName = "create-toc")
|
@Observed(name = "OutlineValidationService", contextualName = "create-toc")
|
||||||
public TableOfContents createToC(List<TextPageBlock> headlines) {
|
public TableOfContents createToC(ClassificationDocument classificationDocument) {
|
||||||
|
|
||||||
|
List<TextPageBlock> headlines = extractHeadlines(classificationDocument);
|
||||||
|
|
||||||
List<TableOfContentItem> mainSections = new ArrayList<>();
|
List<TableOfContentItem> mainSections = new ArrayList<>();
|
||||||
Map<Integer, TableOfContentItem> lastItemsPerDepth = new HashMap<>();
|
Map<Integer, TableOfContentItem> lastItemsPerDepth = new HashMap<>();
|
||||||
@ -60,4 +63,16 @@ public class OutlineValidationService {
|
|||||||
return new TableOfContents(mainSections);
|
return new TableOfContents(mainSections);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<TextPageBlock> extractHeadlines(ClassificationDocument classificationDocument) {
|
||||||
|
|
||||||
|
return classificationDocument.getPages()
|
||||||
|
.stream()
|
||||||
|
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||||
|
.stream()
|
||||||
|
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
|
||||||
|
.map(tb -> (TextPageBlock) tb))
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -62,74 +62,6 @@ public class DocuMineClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static List<Double> buildHeadlineFontSizes(ClassificationDocument document) {
|
|
||||||
|
|
||||||
if (document.getFontSizeCounter().getCountPerValue().size() <= 6) {
|
|
||||||
return document.getFontSizeCounter().getValuesInReverseOrder();
|
|
||||||
}
|
|
||||||
|
|
||||||
List<Map.Entry<Double, Integer>> sortedEntries = new ArrayList<>(document.getFontSizeCounter().getCountPerValue().entrySet());
|
|
||||||
sortedEntries.sort(Map.Entry.comparingByKey());
|
|
||||||
|
|
||||||
int totalCount = sortedEntries.stream()
|
|
||||||
.mapToInt(Map.Entry::getValue).sum();
|
|
||||||
|
|
||||||
int cumulativeCount = 0;
|
|
||||||
Iterator<Map.Entry<Double, Integer>> iterator = sortedEntries.iterator();
|
|
||||||
while (iterator.hasNext()) {
|
|
||||||
Map.Entry<Double, Integer> entry = iterator.next();
|
|
||||||
cumulativeCount += entry.getValue();
|
|
||||||
if (cumulativeCount > totalCount * 0.3) {
|
|
||||||
break; // We've filtered the bottom 30%, so stop.
|
|
||||||
}
|
|
||||||
iterator.remove();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (sortedEntries.size() < 6) {
|
|
||||||
return document.getFontSizeCounter().getValuesInReverseOrder();
|
|
||||||
}
|
|
||||||
int clusterSize = Math.max(1, sortedEntries.size() / 6);
|
|
||||||
|
|
||||||
List<List<Double>> clusters = new ArrayList<>();
|
|
||||||
for (int i = 0; i < 6; i++) {
|
|
||||||
clusters.add(new ArrayList<>());
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < sortedEntries.size(); i++) {
|
|
||||||
int clusterIndex = Math.min(i / clusterSize, 5);
|
|
||||||
clusters.get(clusterIndex).add(sortedEntries.get(i).getKey());
|
|
||||||
}
|
|
||||||
|
|
||||||
return clusters.stream()
|
|
||||||
.map(cluster -> cluster.stream()
|
|
||||||
.mapToDouble(d -> d).average()
|
|
||||||
.orElseThrow())
|
|
||||||
.sorted(Comparator.reverseOrder())
|
|
||||||
.toList();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private List<AbstractPageBlock> getSurroundingBlocksOnPage(int originalIndex, List<AbstractBlockOnPage> textBlocks) {
|
|
||||||
|
|
||||||
int start = Math.max(originalIndex - SURROUNDING_BLOCKS_RADIUS, 0);
|
|
||||||
int end = Math.min(originalIndex + SURROUNDING_BLOCKS_RADIUS, textBlocks.size());
|
|
||||||
List<AbstractPageBlock> surroundingBlocks = new ArrayList<>(2 * SURROUNDING_BLOCKS_RADIUS);
|
|
||||||
for (int i = start; i < end; i++) {
|
|
||||||
if (i == originalIndex) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (textBlocks.get(i).block().getText().length() <= 1) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (textBlocks.get(i).page() != textBlocks.get(originalIndex).page()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
surroundingBlocks.add(textBlocks.get(i).block());
|
|
||||||
}
|
|
||||||
return surroundingBlocks;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void classifyBlock(HeadlineClassificationService headlineClassificationService,
|
private void classifyBlock(HeadlineClassificationService headlineClassificationService,
|
||||||
int currentIndex,
|
int currentIndex,
|
||||||
List<AbstractBlockOnPage> allBlocks,
|
List<AbstractBlockOnPage> allBlocks,
|
||||||
@ -331,6 +263,74 @@ public class DocuMineClassificationService {
|
|||||||
return blocks;
|
return blocks;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<Double> buildHeadlineFontSizes(ClassificationDocument document) {
|
||||||
|
|
||||||
|
if (document.getFontSizeCounter().getCountPerValue().size() <= 6) {
|
||||||
|
return document.getFontSizeCounter().getValuesInReverseOrder();
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Map.Entry<Double, Integer>> sortedEntries = new ArrayList<>(document.getFontSizeCounter().getCountPerValue().entrySet());
|
||||||
|
sortedEntries.sort(Map.Entry.comparingByKey());
|
||||||
|
|
||||||
|
int totalCount = sortedEntries.stream()
|
||||||
|
.mapToInt(Map.Entry::getValue).sum();
|
||||||
|
|
||||||
|
int cumulativeCount = 0;
|
||||||
|
Iterator<Map.Entry<Double, Integer>> iterator = sortedEntries.iterator();
|
||||||
|
while (iterator.hasNext()) {
|
||||||
|
Map.Entry<Double, Integer> entry = iterator.next();
|
||||||
|
cumulativeCount += entry.getValue();
|
||||||
|
if (cumulativeCount > totalCount * 0.3) {
|
||||||
|
break; // We've filtered the bottom 30%, so stop.
|
||||||
|
}
|
||||||
|
iterator.remove();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sortedEntries.size() < 6) {
|
||||||
|
return document.getFontSizeCounter().getValuesInReverseOrder();
|
||||||
|
}
|
||||||
|
int clusterSize = Math.max(1, sortedEntries.size() / 6);
|
||||||
|
|
||||||
|
List<List<Double>> clusters = new ArrayList<>();
|
||||||
|
for (int i = 0; i < 6; i++) {
|
||||||
|
clusters.add(new ArrayList<>());
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < sortedEntries.size(); i++) {
|
||||||
|
int clusterIndex = Math.min(i / clusterSize, 5);
|
||||||
|
clusters.get(clusterIndex).add(sortedEntries.get(i).getKey());
|
||||||
|
}
|
||||||
|
|
||||||
|
return clusters.stream()
|
||||||
|
.map(cluster -> cluster.stream()
|
||||||
|
.mapToDouble(d -> d).average()
|
||||||
|
.orElseThrow())
|
||||||
|
.sorted(Comparator.reverseOrder())
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<AbstractPageBlock> getSurroundingBlocksOnPage(int originalIndex, List<AbstractBlockOnPage> textBlocks) {
|
||||||
|
|
||||||
|
int start = Math.max(originalIndex - SURROUNDING_BLOCKS_RADIUS, 0);
|
||||||
|
int end = Math.min(originalIndex + SURROUNDING_BLOCKS_RADIUS, textBlocks.size());
|
||||||
|
List<AbstractPageBlock> surroundingBlocks = new ArrayList<>(2 * SURROUNDING_BLOCKS_RADIUS);
|
||||||
|
for (int i = start; i < end; i++) {
|
||||||
|
if (i == originalIndex) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (textBlocks.get(i).block().getText().length() <= 1) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!textBlocks.get(i).page().equals(textBlocks.get(originalIndex).page())) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
surroundingBlocks.add(textBlocks.get(i).block());
|
||||||
|
}
|
||||||
|
return surroundingBlocks;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -79,7 +79,7 @@ public class OutlineDetectionTest extends AbstractTest {
|
|||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
|
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.DOCUMINE_OLD);
|
||||||
Document document = buildGraph(fileName, classificationDocument);
|
Document document = buildGraph(fileName, classificationDocument);
|
||||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||||
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
|
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
|
||||||
@ -102,7 +102,7 @@ public class OutlineDetectionTest extends AbstractTest {
|
|||||||
|
|
||||||
TableOfContents tableOfContents = classificationDocument.getTableOfContents();
|
TableOfContents tableOfContents = classificationDocument.getTableOfContents();
|
||||||
|
|
||||||
assertEquals(tableOfContents.getMainSections().size(), 10);
|
assertEquals(tableOfContents.getMainSections().size(), 9);
|
||||||
assertEquals(tableOfContents.getMainSections().subList(1, 9)
|
assertEquals(tableOfContents.getMainSections().subList(1, 9)
|
||||||
.stream()
|
.stream()
|
||||||
.map(tableOfContentItem -> sanitizeString(tableOfContentItem.getHeadline().toString()))
|
.map(tableOfContentItem -> sanitizeString(tableOfContentItem.getHeadline().toString()))
|
||||||
@ -135,7 +135,7 @@ public class OutlineDetectionTest extends AbstractTest {
|
|||||||
|
|
||||||
List<SemanticNode> childrenOfTypeSectionOrSuperSection = document.getChildrenOfTypeSectionOrSuperSection();
|
List<SemanticNode> childrenOfTypeSectionOrSuperSection = document.getChildrenOfTypeSectionOrSuperSection();
|
||||||
|
|
||||||
assertEquals(childrenOfTypeSectionOrSuperSection.size(), 10);
|
assertEquals(childrenOfTypeSectionOrSuperSection.size(), 9);
|
||||||
assertEquals(childrenOfTypeSectionOrSuperSection.subList(1, 9)
|
assertEquals(childrenOfTypeSectionOrSuperSection.subList(1, 9)
|
||||||
.stream()
|
.stream()
|
||||||
.map(section -> sanitizeString(section.getHeadline().getLeafTextBlock().toString()))
|
.map(section -> sanitizeString(section.getHeadline().getLeafTextBlock().toString()))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user