RED-3974: Use first row as header if header detection does not find a header

This commit is contained in:
deiflaender 2022-08-11 11:44:31 +02:00
parent 85c44374d9
commit f84a366328
4 changed files with 96 additions and 31 deletions

View File

@ -1,19 +1,35 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.classification.model.*;
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@ -35,8 +51,7 @@ public class SectionTextBuilderService {
}
sectionNumber.incrementAndGet();
}
sectionTexts.add(processText(paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph.getHeadline(), sectionNumber, paragraph
.getImages()));
sectionTexts.add(processText(paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph.getHeadline(), sectionNumber, paragraph.getImages()));
sectionNumber.incrementAndGet();
}
@ -62,20 +77,26 @@ public class SectionTextBuilderService {
private List<SectionText> processTablePerRow(Table table, AtomicInteger sectionNumber) {
List<SectionText> sectionTexts = new ArrayList<>();
for (List<Cell> row : table.getRows()) {
boolean hasHeader = table.getRows()
.stream()
.anyMatch(row -> row.stream()
.anyMatch(cell -> !cell.isHeaderCell() && !cell.getHeaderCells().isEmpty()));
for (int rowNum = 0; rowNum < table.getRows().size(); rowNum++) {
SearchableText searchableRow = new SearchableText();
Map<String, CellValue> tabularData = new HashMap<>();
int start = 0;
List<Integer> cellStarts = new ArrayList<>();
SectionText sectionText = new SectionText();
for (Cell cell : row) {
for (int cellNum = 0; cellNum < table.getRows().get(rowNum).size(); cellNum++) {
Cell cell = table.getRows().get(rowNum).get(cellNum);
if (CollectionUtils.isEmpty(cell.getTextBlocks())) {
continue;
}
SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell
.getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell.getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
.get(0)
.getSequences()
.get(0)
@ -86,16 +107,17 @@ public class SectionTextBuilderService {
int cellStart = start;
if (!cell.isHeaderCell()) {
cell.getHeaderCells().forEach(headerCell -> {
StringBuilder headerBuilder = new StringBuilder();
headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
String headerName = headerBuilder.toString()
.replaceAll("\n", "")
.replaceAll(" ", "")
.replaceAll("-", "");
if (hasHeader) {
cell.getHeaderCells().forEach(headerCell -> {
String headerName = buildHeaderName(headerCell);
sectionArea.setHeader(headerName);
tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart));
});
} else {
String headerName = buildHeaderName(table.getRows().get(0).get(cellNum));
sectionArea.setHeader(headerName);
tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart));
});
}
}
for (TextBlock textBlock : cell.getTextBlocks()) {
@ -122,6 +144,14 @@ public class SectionTextBuilderService {
}
/**
 * Builds the normalized header name for a table cell.
 * <p>
 * The text of all text blocks of the cell is concatenated in order, then every line feed, space and hyphen
 * is removed from the result.
 *
 * @param cell the cell whose text is used as header name
 * @return the normalized header name; an empty string if the cell is {@code null} or has no text blocks
 */
private String buildHeaderName(Cell cell) {

    // The first-row fallback passes a cell that was not checked for text blocks beforehand.
    // Return the same value an empty block list would produce instead of failing with a NullPointerException.
    if (cell == null || cell.getTextBlocks() == null) {
        return "";
    }

    StringBuilder headerBuilder = new StringBuilder();
    cell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));

    // All three patterns are plain literals, so String.replace yields the same result as replaceAll
    // without compiling a regular expression on every call.
    return headerBuilder.toString().replace("\n", "").replace(" ", "").replace("-", "");
}
private SectionText processTableAsOneText(Table table, AtomicInteger sectionNumber) {
SearchableText entireTableText = new SearchableText();
@ -130,14 +160,12 @@ public class SectionTextBuilderService {
List<Integer> cellStarts = new ArrayList<>();
for (List<Cell> row : table.getRows()) {
for (Cell cell : row) {
if (CollectionUtils.isEmpty(cell.getTextBlocks())) {
continue;
}
SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell
.getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell.getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
.get(0)
.getSequences()
.get(0)
@ -149,7 +177,6 @@ public class SectionTextBuilderService {
entireTableText.addAll(textBlock.getSequences());
}
cellStarts.add(start);
start = start + cell.toString().trim().length() + 1;
}
@ -170,8 +197,7 @@ public class SectionTextBuilderService {
SectionText sectionText = new SectionText();
for (TextBlock paragraphTextBlock : paragraphTextBlocks) {
SectionArea sectionArea = new SectionArea(new Point(paragraphTextBlock.getMinX(), paragraphTextBlock.getMinY()), paragraphTextBlock
.getWidth(), paragraphTextBlock.getHeight(), paragraphTextBlock.getPage(), null);
SectionArea sectionArea = new SectionArea(new Point(paragraphTextBlock.getMinX(), paragraphTextBlock.getMinY()), paragraphTextBlock.getWidth(), paragraphTextBlock.getHeight(), paragraphTextBlock.getPage(), null);
sectionText.getSectionAreas().add(sectionArea);
}

View File

@ -882,6 +882,36 @@ public class RedactionIntegrationTest {
}
/**
 * Analyzes and annotates the minimal example "NoHeaderTable.pdf" and expects exactly five redaction log entries.
 * <p>
 * The annotated document is written to the temporary directory as "Annotated.pdf"; the duration and the
 * number of pages are printed to standard out.
 *
 * @throws IOException if the annotated document cannot be written
 */
@Test
public void testTableHeader() throws IOException {

    System.out.println("testTableHeader");
    long startedAt = System.currentTimeMillis();

    // Structure analysis has to run before the actual analysis of the uploaded file.
    AnalyzeRequest analyzeRequest = prepareStorage("files/Minimal Examples/NoHeaderTable.pdf");
    StructureAnalyzeRequest structureRequest = new StructureAnalyzeRequest(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
    analyzeService.analyzeDocumentStructure(structureRequest);
    AnalyzeResult analyzeResult = analyzeService.analyze(analyzeRequest);

    var annotateRequest = AnnotateRequest.builder().dossierId(TEST_DOSSIER_ID).fileId(TEST_FILE_ID).build();
    AnnotateResponse annotated = annotationService.annotate(annotateRequest);

    // Keep the annotated PDF on disk so the result can be inspected manually.
    String annotatedPdfPath = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";
    try (FileOutputStream annotatedPdf = new FileOutputStream(annotatedPdfPath)) {
        annotatedPdf.write(annotated.getDocument());
    }

    var storedRedactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
    assertThat(storedRedactionLog.getRedactionLogEntry().size()).isEqualTo(5);

    long elapsedMillis = System.currentTimeMillis() - startedAt;
    System.out.println("duration: " + elapsedMillis);
    System.out.println("numberOfPages: " + analyzeResult.getNumberOfPages());
}
@Test
public void testFindDictionaryEntryInResizedEntryPosition() throws IOException {

View File

@ -381,4 +381,13 @@ rule "30: Ignore dossier_redactions if confidential"
Section(!fileAttributeByLabelEqualsIgnoreCase("Confidentiality","confidential") && matchesType("dossier_redactions"));
then
section.ignore("dossier_redactions");
end
// ex. "New Rules for PAD" - "Annex A" - page 21, page 35 (table without header), page 38 (in-text)
// https://www.regexplanet.com/share/index.html?share=yyyypb71xkr
// Rule 101: redacts CAS-number-like identifiers in sections that have a table header named "Sample #".
// Assuming Java regex semantics, the pattern matches, between word boundaries: one digit 1-9 followed by
// 1-5 further digits, "-", two digits, "-", an optional line break (\R), and finally 1-2 digits.
// NOTE(review): CAS Registry Numbers have 2-7 digits in the first segment and a single check digit at the
// end; the pattern accepts 2-6 and 1-2 digits respectively - confirm this is intended.
rule "101: Redact CAS numbers"
when
// Fires only for sections for which hasTableHeader("Sample #") holds.
Section(hasTableHeader("Sample #"))
then
// Redacts every regex match as type "PII" under rule number 101 with the given reason and legal basis.
// NOTE(review): the meaning of the arguments `true` and `0` is not visible here - presumably a
// case-insensitivity flag and the regex group to redact; verify against redactByRegEx.
section.redactByRegEx("\\b[1-9]{1}[0-9]{1,5}-\\d{2}-\\R?\\d{1,2}\\b", true, 0, "PII", 101, "compound/sample identifier", "Article 4(2) first indent of Regulation No. 1049/2001");
end