RED-7461: improve header/footer recognition

This commit is contained in:
Kilian Schuettler 2023-08-21 16:20:23 +02:00
parent 3722fff476
commit 2b15fd1d3c
11 changed files with 116 additions and 91 deletions

View File

@ -103,7 +103,7 @@ public class LayoutParsingPipeline {
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
try (var out = new ByteArrayOutputStream()) {
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out);
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out, false);
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out);
}

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.util.Comparator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
@ -13,6 +14,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
@ -20,65 +22,61 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
@Service
public class BodyTextFrameService {
private static final float RULING_HEIGHT_THRESHOLD = 0.15f; // multiplied with page height. Header/Footer Rulings must be within that border of the page.
private static final float RULING_WIDTH_THRESHOLD = 0.75f; // multiplied with page width. Header/Footer Rulings must be at least that wide.
public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType);
for (ClassificationPage page : classificationDocument.getPages()) {
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
setBodyTextFrameAdjustedToPage(page, updatedBodyTextFrame, updatedBodyTextFrame);
}
}
/*
private Rectangle calculateBodyTextFrameByRulings(List<ClassificationPage> pages) {
private Rectangle getBodyTextFrameFromRulings(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
Map<ClassificationPage, List<Ruling>> potentialHeaderRulingsPerPage = new HashMap<>();
Map<ClassificationPage, List<Ruling>> potentialFooterRulingsPerPage = new HashMap<>();
for (var page : pages) {
potentialHeaderRulingsPerPage.put(page,
page.getCleanRulings()
.getHorizontal()
.stream()
.filter(ruling -> ruling.getY1() > page.getPageHeight() * 0.8)
.filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth())
.toList());
potentialFooterRulingsPerPage.put(page,
page.getCleanRulings()
.getHorizontal()
.stream()
.filter(ruling -> ruling.getY1() < page.getPageHeight() * 0.2)
.filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth())
.toList());
List<Ruling> potentialFooterRulings = getPotentialFooterRulings(page);
List<Ruling> potentialHeaderRulings = getPotentialHeaderRulings(page);
var x = bodyTextFrame.getTopLeft().getX();
var y = bodyTextFrame.getTopLeft().getY();
var w = bodyTextFrame.getWidth();
var h = bodyTextFrame.getHeight();
if (!potentialFooterRulings.isEmpty()) {
h = y + h - potentialFooterRulings.get(0).getTop();
y = potentialFooterRulings.get(0).getTop();
}
Optional<Ruling> headerRuling = potentialHeaderRulingsPerPage.values()
.stream()
.flatMap(Collection::stream)
.filter(ruling -> potentialHeaderRulingsPerPage.values()
.stream()
.filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches))
.count() > pages.size() * RULING_THRESHOLD_FACTOR)
.min(Comparator.comparingDouble(Ruling::getY1));
Optional<Ruling> footerRuling = potentialFooterRulingsPerPage.values()
.stream()
.flatMap(Collection::stream)
.filter(ruling -> potentialHeaderRulingsPerPage.values()
.stream()
.filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches))
.count() > pages.size() * RULING_THRESHOLD_FACTOR)
.max(Comparator.comparingDouble(Ruling::getY1));
double maxY = headerRuling.isPresent() ? headerRuling.get().y1 : pages.stream().mapToDouble(ClassificationPage::getPageHeight).max().orElse(Double.MAX_VALUE);
double minY = footerRuling.map(ruling -> ruling.y1).orElse(0F);
double maxX = pages.stream().mapToDouble(ClassificationPage::getPageWidth).max().orElse(Double.MAX_VALUE);
return new Rectangle(new Point((float) maxX, (float) maxY), (float) 0, (float) minY, -1);
if (!potentialHeaderRulings.isEmpty()) {
h = potentialHeaderRulings.get(0).getBottom() - bodyTextFrame.getTopLeft().getY();
}
return new Rectangle(new Point(x, y), w, h, page.getPageNumber());
}
/**
 * Selects the horizontal rulings of a page that qualify as footer rulings.
 * <p>
 * A ruling qualifies when its Y1 coordinate is smaller than {@code RULING_HEIGHT_THRESHOLD} times the page height
 * and its width is larger than {@code RULING_WIDTH_THRESHOLD} times the page width.
 *
 * @param page the page whose clean horizontal rulings are inspected
 * @return the qualifying rulings, ordered ascending by {@link Ruling#getTop()}
 */
private List<Ruling> getPotentialFooterRulings(ClassificationPage page) {

    return page.getCleanRulings()
            .getHorizontal()
            .stream()
            // position check first, width check second; both must hold for a ruling to be kept
            .filter(candidate -> candidate.getY1() < page.getPageHeight() * RULING_HEIGHT_THRESHOLD
                    && candidate.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
            // ascending by top coordinate
            .sorted((first, second) -> Double.compare(first.getTop(), second.getTop()))
            .toList();
}
/**
 * Selects the horizontal rulings of a page that qualify as header rulings.
 * <p>
 * A ruling qualifies when its Y1 coordinate is larger than {@code (1 - RULING_HEIGHT_THRESHOLD)} times the page height
 * and its width is larger than {@code RULING_WIDTH_THRESHOLD} times the page width.
 *
 * @param page the page whose clean horizontal rulings are inspected
 * @return the qualifying rulings, ordered descending by {@link Ruling#getBottom()}
 */
private List<Ruling> getPotentialHeaderRulings(ClassificationPage page) {

    return page.getCleanRulings()
            .getHorizontal()
            .stream()
            // position check first, width check second; both must hold for a ruling to be kept
            .filter(candidate -> candidate.getY1() > page.getPageHeight() * (1 - RULING_HEIGHT_THRESHOLD)
                    && candidate.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
            // descending by bottom coordinate: operands are swapped relative to natural order
            .sorted((first, second) -> Double.compare(second.getBottom(), first.getBottom()))
            .toList();
}
*/
/**
@ -129,10 +127,10 @@ public class BodyTextFrameService {
* @param landscape Calculate for landscape or portrait
* @return Rectangle of the text frame
*/
private Rectangle calculateBodyTextFrame(List<ClassificationPage> pages,
FloatFrequencyCounter documentFontSizeCounter,
boolean landscape,
LayoutParsingType layoutParsingType) {
protected Rectangle calculateBodyTextFrame(List<ClassificationPage> pages,
FloatFrequencyCounter documentFontSizeCounter,
boolean landscape,
LayoutParsingType layoutParsingType) {
float approximateHeaderLineCount;
if (layoutParsingType.equals(LayoutParsingType.TAAS)) {

View File

@ -39,8 +39,8 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class ViewerDocumentService {
private static final String layerName = "Layout grid";
private static final String LAYER_NAME = "Layout grid";
private static final int FONT_SIZE = 10;
public static final float LINE_WIDTH = 1f;
@ -48,14 +48,14 @@ public class ViewerDocumentService {
@SneakyThrows
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream) {
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) {
log.info("Start Viewer Document Creation");
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate);
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue);
PDFont font = PDType1Font.HELVETICA;
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
@ -119,6 +119,7 @@ public class ViewerDocumentService {
dictionariesToUpdate.add(pdPage.getResources().getCOSObject());
}
dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject());
// dictionariesToUpdate.add(pdDocument.getDocument().getTrailer());
pdDocument.saveIncremental(outputStream, dictionariesToUpdate);
log.info("Saved Viewer Document");
}
@ -145,7 +146,7 @@ public class ViewerDocumentService {
}
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, Set<COSDictionary> dictionariesToUpdate) {
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, Set<COSDictionary> dictionariesToUpdate, boolean layerVisibilityDefaultValue) {
PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
PDOptionalContentProperties ocprops = catalog.getOCProperties();
@ -154,13 +155,13 @@ public class ViewerDocumentService {
catalog.setOCProperties(ocprops);
}
PDOptionalContentGroup layer = null;
if (ocprops.hasGroup(layerName)) {
layer = ocprops.getGroup(layerName);
if (ocprops.hasGroup(LAYER_NAME)) {
layer = ocprops.getGroup(LAYER_NAME);
} else {
layer = new PDOptionalContentGroup(layerName);
layer = new PDOptionalContentGroup(LAYER_NAME);
ocprops.addGroup(layer);
}
ocprops.setGroupEnabled(layer, false);
ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue);
dictionariesToUpdate.add(catalog.getCOSObject());
return layer;
}

View File

@ -12,10 +12,11 @@ import org.springframework.core.io.ClassPathResource;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import lombok.SneakyThrows;
public class DocumentDataTests extends BuildDocumentGraphTest{
public class DocumentDataTests extends BuildDocumentTest {
@Test
@SneakyThrows
public void createDocumentDataForAllFiles() {

View File

@ -20,10 +20,11 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import lombok.SneakyThrows;
public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest {
public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
@Test
@SneakyThrows

View File

@ -16,11 +16,12 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Ta
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentGraphMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.tenantcommons.TenantContext;
import lombok.SneakyThrows;
public class DocumentGraphMappingTest extends BuildDocumentGraphTest {
public class DocumentGraphMappingTest extends BuildDocumentTest {
@Test
@SneakyThrows

View File

@ -13,13 +13,14 @@ import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest {
public class DocumentGraphVisualizationTest extends BuildDocumentTest {
@Test
@SneakyThrows

View File

@ -12,10 +12,11 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import lombok.SneakyThrows;
public class ViewerDocumentTest extends BuildDocumentGraphTest {
public class ViewerDocumentTest extends BuildDocumentTest {
@Test
@Disabled
@ -28,7 +29,7 @@ public class ViewerDocumentTest extends BuildDocumentGraphTest {
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getInputStream()); var out = new FileOutputStream(tmpFileName)) {
viewerDocumentService.createViewerDocument(pdDocument, document, out);
viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
}
}

View File

@ -0,0 +1,31 @@
package com.knecon.fforesight.service.layoutparser.server.services;
import java.nio.file.Path;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows;
/**
 * Drawing-based test for the body text frames assigned to the pages of a parsed document.
 */
class BodyTextFrameServiceTest extends BuildDocumentTest {

    /**
     * Parses {@code files/211.pdf} with {@link LayoutParsingType#TAAS}, converts every page's body text frame
     * to a rectangle and hands the per-page rectangles to {@code PdfDraw.drawRectanglesPerPage} together with
     * the input file name and an output file name under {@code /tmp}.
     * <p>
     * The method contains no assertions; presumably the produced file is meant for manual inspection.
     */
    @Test
    @SneakyThrows
    public void testCalculateBodyTextFrame() {

        String inputFile = "files/211.pdf";
        String drawnFile = "/tmp/" + Path.of(inputFile).getFileName() + "_MAINBODY.pdf";

        ClassificationDocument parsedDocument = parseLayout(inputFile, LayoutParsingType.TAAS);

        // One single-element list per page, holding that page's body text frame.
        var framesPerPage = parsedDocument.getPages()
                .stream()
                .map(page -> List.of(RectangleTransformations.toRectangle2D(page.getBodyTextFrame())))
                .toList();

        PdfDraw.drawRectanglesPerPage(inputFile, framesPerPage, drawnFile);
    }
}

View File

@ -5,7 +5,6 @@ import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
@ -19,13 +18,13 @@ import lombok.SneakyThrows;
public class RulingCleaningServiceTest {
@Test
@Disabled
// @Disabled
@SneakyThrows
public void textRulingExtraction() {
String fileName = "files/BASF/2013-1110704.pdf";
String fileName = "files/211.pdf";
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents("files/BASF/2013-1110704.pdf");
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
RulingCleaningService rulingCleaningService = new RulingCleaningService();

View File

@ -1,39 +1,35 @@
package com.knecon.fforesight.service.layoutparser.server.graph;
import static org.junit.jupiter.api.Assertions.assertEquals;
package com.knecon.fforesight.service.layoutparser.server.utils;
import java.io.InputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
import lombok.SneakyThrows;
public class BuildDocumentGraphTest extends AbstractTest {
public abstract class BuildDocumentTest extends AbstractTest {
@Autowired
protected LayoutParsingPipeline layoutParsingPipeline;
@Test
@Disabled
public void buildMetolachlor() {
@SneakyThrows
protected ClassificationDocument parseLayout(String filename, LayoutParsingType layoutParsingType) {
Document documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
assertEquals(221, documentGraph.getPages().size());
assertEquals(220, documentGraph.getPages().stream().filter(page -> page.getHeader().hasText()).count());
assertEquals(0, documentGraph.getPages().stream().filter(page -> page.getFooter().hasText()).count());
ClassPathResource fileResource = new ClassPathResource(filename);
prepareStorage(filename);
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) {
return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse());
}
}
@ -52,14 +48,9 @@ public class BuildDocumentGraphTest extends AbstractTest {
} else {
prepareStorage(filename);
}
ClassPathResource fileResource = new ClassPathResource(filename);
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) {
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(layoutParsingType,
pdDocument,
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
new TableServiceResponse()));
}
return DocumentGraphFactory.buildDocumentGraph(parseLayout(filename, layoutParsingType));
}
}