Pull request #450: RED-3974: Use first row as header if header detection does not find a header
Merge in RED/redaction-service from RED-3974 to master * commit 'cba81ce061df867936314dcbdbc565248c8db006': RED-3974: Refactored processTablePerRow RED-3974: Use first row as header if header detection does not find a header
This commit is contained in:
commit
07aaa9722a
@ -27,11 +27,15 @@ public class SectionText {
|
||||
private boolean isTable;
|
||||
private String headline;
|
||||
|
||||
@Builder.Default
|
||||
private List<SectionArea> sectionAreas = new ArrayList<>();
|
||||
@Builder.Default
|
||||
private Set<Image> images = new HashSet<>();
|
||||
|
||||
@Builder.Default
|
||||
private List<TextBlock> textBlocks = new ArrayList<>();
|
||||
@Builder.Default
|
||||
private Map<String, CellValue> tabularData = new HashMap<>();
|
||||
@Builder.Default
|
||||
private List<Integer> cellStarts = new ArrayList<>();
|
||||
|
||||
|
||||
|
||||
@ -194,7 +194,7 @@ public class SearchableText {
|
||||
}
|
||||
|
||||
|
||||
public String buildString(List<TextPositionSequence> sequences) {
|
||||
public static String buildString(List<TextPositionSequence> sequences) {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
|
||||
@ -1,19 +1,35 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Collectors;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -35,8 +51,7 @@ public class SectionTextBuilderService {
|
||||
}
|
||||
sectionNumber.incrementAndGet();
|
||||
}
|
||||
sectionTexts.add(processText(paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph.getHeadline(), sectionNumber, paragraph
|
||||
.getImages()));
|
||||
sectionTexts.add(processText(paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph.getHeadline(), sectionNumber, paragraph.getImages()));
|
||||
sectionNumber.incrementAndGet();
|
||||
}
|
||||
|
||||
@ -62,58 +77,48 @@ public class SectionTextBuilderService {
|
||||
private List<SectionText> processTablePerRow(Table table, AtomicInteger sectionNumber) {
|
||||
|
||||
List<SectionText> sectionTexts = new ArrayList<>();
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
SearchableText searchableRow = new SearchableText();
|
||||
Map<String, CellValue> tabularData = new HashMap<>();
|
||||
int start = 0;
|
||||
List<Integer> cellStarts = new ArrayList<>();
|
||||
SectionText sectionText = new SectionText();
|
||||
for (Cell cell : row) {
|
||||
boolean hasHeader = hasTableHeader(table);
|
||||
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
|
||||
List<TextBlock> textBlocks = new ArrayList<>();
|
||||
List<SectionArea> areas = new ArrayList<>();
|
||||
Map<String, CellValue> tabularData = new HashMap<>();
|
||||
List<Integer> startOffsets = new ArrayList<>();
|
||||
|
||||
int startOffset = 0;
|
||||
for (int cellNum = 0; cellNum < row.size(); cellNum++) {
|
||||
|
||||
Cell cell = row.get(cellNum);
|
||||
if (CollectionUtils.isEmpty(cell.getTextBlocks())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell
|
||||
.getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
|
||||
.get(0)
|
||||
.getSequences()
|
||||
.get(0)
|
||||
.getPage(), null);
|
||||
sectionText.getSectionAreas().add(sectionArea);
|
||||
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
|
||||
|
||||
int cellStart = start;
|
||||
SectionArea sectionArea = getSectionArea(cell);
|
||||
areas.add(sectionArea);
|
||||
|
||||
if (!cell.isHeaderCell()) {
|
||||
cell.getHeaderCells().forEach(headerCell -> {
|
||||
StringBuilder headerBuilder = new StringBuilder();
|
||||
headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
|
||||
String headerName = headerBuilder.toString()
|
||||
.replaceAll("\n", "")
|
||||
.replaceAll(" ", "")
|
||||
.replaceAll("-", "");
|
||||
sectionArea.setHeader(headerName);
|
||||
tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart));
|
||||
});
|
||||
String headerName = getHeaderName(hasHeader, cell, table.getRows().get(0).get(cellNum));
|
||||
sectionArea.setHeader(headerName);
|
||||
tabularData.put(headerName, new CellValue(cell.getTextBlocks(), startOffset));
|
||||
}
|
||||
|
||||
for (TextBlock textBlock : cell.getTextBlocks()) {
|
||||
// TODO avoid cell overlap merging.
|
||||
searchableRow.addAll(textBlock.getSequences());
|
||||
}
|
||||
cellStarts.add(cellStart);
|
||||
start = start + cell.toString().trim().length() + 1;
|
||||
textBlocks.addAll(cell.getTextBlocks());
|
||||
|
||||
startOffsets.add(startOffset);
|
||||
startOffset = startOffset + cell.toString().trim().length() + 1;
|
||||
}
|
||||
|
||||
sectionText.setText(searchableRow.toString());
|
||||
sectionText.setHeadline(table.getHeadline());
|
||||
sectionText.setSectionNumber(sectionNumber.intValue());
|
||||
sectionText.setTable(true);
|
||||
sectionText.setTabularData(tabularData);
|
||||
sectionText.setCellStarts(cellStarts);
|
||||
sectionTexts.add(sectionText);
|
||||
sectionTexts.add(SectionText.builder()
|
||||
.text(getRowText(textBlocks))
|
||||
.headline(table.getHeadline())
|
||||
.sectionNumber(sectionNumber.intValue())
|
||||
.isTable(true)
|
||||
.tabularData(tabularData)
|
||||
.cellStarts(startOffsets)
|
||||
.textBlocks(textBlocks)
|
||||
.sectionAreas(areas)
|
||||
.build());
|
||||
|
||||
sectionNumber.incrementAndGet();
|
||||
}
|
||||
@ -122,6 +127,49 @@ public class SectionTextBuilderService {
|
||||
}
|
||||
|
||||
|
||||
public String getRowText(List<TextBlock> rowTextBlocks) {
|
||||
|
||||
return SearchableText.buildString(rowTextBlocks.stream()
|
||||
.map(textBlock -> textBlock.getSequences())
|
||||
.flatMap(List::stream)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
|
||||
private boolean hasTableHeader(Table table) {
|
||||
|
||||
return table.getRows()
|
||||
.stream()
|
||||
.anyMatch(row -> row.stream()
|
||||
.anyMatch(cell -> !cell.isHeaderCell() && !cell.getHeaderCells().isEmpty()));
|
||||
}
|
||||
|
||||
|
||||
private SectionArea getSectionArea(Cell cell) {
|
||||
|
||||
return new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell.getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
|
||||
.get(0)
|
||||
.getSequences()
|
||||
.get(0)
|
||||
.getPage(), null);
|
||||
}
|
||||
|
||||
|
||||
private String getHeaderName(boolean hasHeader, Cell currentCell, Cell cellInFirstRow) {
|
||||
|
||||
return hasHeader ? buildHeaderName(currentCell.getHeaderCells()
|
||||
.get(currentCell.getHeaderCells().size() - 1)) : buildHeaderName(cellInFirstRow);
|
||||
}
|
||||
|
||||
|
||||
private String buildHeaderName(Cell cell) {
|
||||
|
||||
StringBuilder headerBuilder = new StringBuilder();
|
||||
cell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
|
||||
return headerBuilder.toString().replaceAll("\n", "").replaceAll(" ", "").replaceAll("-", "");
|
||||
}
|
||||
|
||||
|
||||
private SectionText processTableAsOneText(Table table, AtomicInteger sectionNumber) {
|
||||
|
||||
SearchableText entireTableText = new SearchableText();
|
||||
@ -130,14 +178,12 @@ public class SectionTextBuilderService {
|
||||
List<Integer> cellStarts = new ArrayList<>();
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
|
||||
|
||||
for (Cell cell : row) {
|
||||
if (CollectionUtils.isEmpty(cell.getTextBlocks())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell
|
||||
.getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
|
||||
SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell.getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
|
||||
.get(0)
|
||||
.getSequences()
|
||||
.get(0)
|
||||
@ -149,7 +195,6 @@ public class SectionTextBuilderService {
|
||||
entireTableText.addAll(textBlock.getSequences());
|
||||
}
|
||||
|
||||
|
||||
cellStarts.add(start);
|
||||
start = start + cell.toString().trim().length() + 1;
|
||||
}
|
||||
@ -170,8 +215,7 @@ public class SectionTextBuilderService {
|
||||
|
||||
SectionText sectionText = new SectionText();
|
||||
for (TextBlock paragraphTextBlock : paragraphTextBlocks) {
|
||||
SectionArea sectionArea = new SectionArea(new Point(paragraphTextBlock.getMinX(), paragraphTextBlock.getMinY()), paragraphTextBlock
|
||||
.getWidth(), paragraphTextBlock.getHeight(), paragraphTextBlock.getPage(), null);
|
||||
SectionArea sectionArea = new SectionArea(new Point(paragraphTextBlock.getMinX(), paragraphTextBlock.getMinY()), paragraphTextBlock.getWidth(), paragraphTextBlock.getHeight(), paragraphTextBlock.getPage(), null);
|
||||
sectionText.getSectionAreas().add(sectionArea);
|
||||
}
|
||||
|
||||
|
||||
@ -882,6 +882,36 @@ public class RedactionIntegrationTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testTableHeader() throws IOException {
|
||||
|
||||
System.out.println("testTableHeader");
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
AnalyzeRequest request = prepareStorage("files/Minimal Examples/NoHeaderTable.pdf");
|
||||
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
|
||||
AnalyzeResult result = analyzeService.analyze(request);
|
||||
|
||||
AnnotateResponse annotateResponse = annotationService.annotate(AnnotateRequest.builder()
|
||||
.dossierId(TEST_DOSSIER_ID)
|
||||
.fileId(TEST_FILE_ID)
|
||||
.build());
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream(OsUtils.getTemporaryDirectory() + "/Annotated.pdf")) {
|
||||
fileOutputStream.write(annotateResponse.getDocument());
|
||||
}
|
||||
|
||||
var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
|
||||
assertThat(redactionLog.getRedactionLogEntry().size()).isEqualTo(5);
|
||||
|
||||
long end = System.currentTimeMillis();
|
||||
|
||||
System.out.println("duration: " + (end - start));
|
||||
System.out.println("numberOfPages: " + result.getNumberOfPages());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
public void testFindDictionaryEntryInResizedEntryPosition() throws IOException {
|
||||
|
||||
|
||||
@ -381,4 +381,13 @@ rule "30: Ignore dossier_redactions if confidential"
|
||||
Section(!fileAttributeByLabelEqualsIgnoreCase("Confidentiality","confidential") && matchesType("dossier_redactions"));
|
||||
then
|
||||
section.ignore("dossier_redactions");
|
||||
end
|
||||
|
||||
// ex. "New Rules for PAD" - "Annex A" - page 21, page 35 (table without header), page 38 (in-text)
|
||||
// https://www.regexplanet.com/share/index.html?share=yyyypb71xkr
|
||||
// Fires for sections matching hasTableHeader("Sample #") and hands the identifier pattern to
// section.redactByRegEx together with the type "PII", the rule number 101, the reason text and
// the legal basis text.
// NOTE(review): the meaning of the boolean and the 0 argument is not visible here; confirm
// against Section.redactByRegEx.
rule "101: Redact CAS numbers"
    when
        Section(hasTableHeader("Sample #"))
    then
        section.redactByRegEx("\\b[1-9]{1}[0-9]{1,5}-\\d{2}-\\R?\\d{1,2}\\b", true, 0, "PII", 101, "compound/sample identifier", "Article 4(2) first indent of Regulation No. 1049/2001");
end
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user