RED-1045: Enable redaction in headers and footers

This commit is contained in:
Dominique Eifländer 2021-02-05 13:05:12 +01:00
parent 154e09b843
commit 577db37b11
5 changed files with 97 additions and 15 deletions

View File

@ -18,6 +18,8 @@ public class Document {
private List<Page> pages = new ArrayList<>();
private List<Paragraph> paragraphs = new ArrayList<>();
private List<Header> headers = new ArrayList<>();
private List<Footer> footers = new ArrayList<>();
private Map<Integer, List<Entity>> entities = new HashMap<>();
private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter();
private FloatFrequencyCounter fontSizeCounter= new FloatFrequencyCounter();

View File

@ -0,0 +1,24 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * Holds the text blocks that belong to a footer.
 */
@Data
@AllArgsConstructor
public class Footer {

    /** Text blocks that make up this footer. */
    private List<TextBlock> textBlocks;

    /**
     * Creates a new {@link SearchableText} and adds the sequences of every text block to it,
     * in list order.
     *
     * @return a freshly created searchable text; nothing is added when the list is empty
     */
    public SearchableText getSearchableText() {
        SearchableText result = new SearchableText();
        for (TextBlock textBlock : textBlocks) {
            result.addAll(textBlock.getSequences());
        }
        return result;
    }
}

View File

@ -0,0 +1,24 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import java.util.List;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * Holds the text blocks that belong to a header.
 */
@Data
@AllArgsConstructor
public class Header {

    /** Text blocks that make up this header. */
    private List<TextBlock> textBlocks;

    /**
     * Creates a new {@link SearchableText} and adds the sequences of every text block to it,
     * in list order.
     *
     * @return a freshly created searchable text; nothing is added when the list is empty
     */
    public SearchableText getSearchableText() {
        SearchableText result = new SearchableText();
        for (TextBlock textBlock : textBlocks) {
            result.addAll(textBlock.getSequences());
        }
        return result;
    }
}

View File

@ -19,6 +19,8 @@ import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
@ -113,7 +115,18 @@ public class EntityRedactionService {
}
sectionNumber.incrementAndGet();
}
sectionSearchableTextPairs.add(processText(paragraph, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
sectionSearchableTextPairs.add(processText(paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph
.getHeadline(), manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
sectionNumber.incrementAndGet();
}
for (Header header : classifiedDoc.getHeaders()) {
sectionSearchableTextPairs.add(processText(header.getSearchableText(), header.getTextBlocks(), "Header", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
sectionNumber.incrementAndGet();
}
for (Footer footer : classifiedDoc.getFooters()) {
sectionSearchableTextPairs.add(processText(footer.getSearchableText(), footer.getTextBlocks(), "Footer", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
sectionNumber.incrementAndGet();
}
@ -253,14 +266,13 @@ public class EntityRedactionService {
}
private SectionSearchableTextPair processText(Paragraph paragraph, ManualRedactions manualRedactions,
private SectionSearchableTextPair processText(SearchableText searchableText, List<TextBlock> paragraphTextBlocks,
String headline, ManualRedactions manualRedactions,
AtomicInteger sectionNumber, Dictionary dictionary, boolean local,
Map<Integer, Set<Entity>> hintsPerSectionNumber) {
SearchableText searchableText = paragraph.getSearchableText();
addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber
.intValue());
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber.intValue(), dictionary, local);
addSectionToManualRedactions(paragraphTextBlocks, manualRedactions, headline, sectionNumber.intValue());
Set<Entity> entities = findEntities(searchableText, headline, sectionNumber.intValue(), dictionary, local);
surroundingWordsService.addSurroundingText(entities, searchableText, dictionary);
return new SectionSearchableTextPair(Section.builder()
@ -271,7 +283,7 @@ public class EntityRedactionService {
.collect(Collectors.toSet()) : entities)
.text(searchableText.getAsStringWithLinebreaks())
.searchText(searchableText.toString())
.headline(paragraph.getHeadline())
.headline(headline)
.sectionNumber(sectionNumber.intValue())
.searchableText(searchableText)
.dictionary(dictionary)

View File

@ -10,6 +10,8 @@ import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
@ -24,25 +26,39 @@ public class SectionsBuilderService {
List<AbstractTextContainer> chunkWords = new ArrayList<>();
List<Paragraph> chunkBlockList = new ArrayList<>();
List<Header> headers = new ArrayList<>();
List<Footer> footers = new ArrayList<>();
AbstractTextContainer prev = null;
String lastHeadline = "";
Table previousTable = null;
for (Page page : document.getPages()) {
List<TextBlock> header = new ArrayList<>();
List<TextBlock> footer = new ArrayList<>();
for (AbstractTextContainer current : page.getTextBlocks()) {
if (current.getClassification() == null || current.getClassification()
.equals("Header") || current.getClassification().equals("Footer")) {
if (current.getClassification() == null) {
continue;
}
current.setPage(page.getPageNumber());
if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification().startsWith("H ") || !document.isHeadlines()) {
if (current.getClassification().equals("Header")) {
header.add((TextBlock) current);
continue;
}
if (current.getClassification().equals("Footer")) {
footer.add((TextBlock) current);
continue;
}
if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification()
.startsWith("H ") || !document.isHeadlines()) {
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
chunkBlock.setHeadline(lastHeadline);
if(document.isHeadlines()) {
if (document.isHeadlines()) {
lastHeadline = current.getText();
}
chunkBlockList.add(chunkBlock);
@ -60,6 +76,8 @@ public class SectionsBuilderService {
chunkWords.add(current);
prev = current;
}
headers.add(new Header(header));
footers.add(new Footer(footer));
}
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
@ -67,6 +85,8 @@ public class SectionsBuilderService {
chunkBlockList.add(chunkBlock);
document.setParagraphs(chunkBlockList);
document.setHeaders(headers);
document.setFooters(footers);
}
@ -175,9 +195,9 @@ public class SectionsBuilderService {
private boolean hasInvalidHeaderInformation(Table table) {
return table.getRows().stream()
.flatMap(row -> row.stream()
.filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells())))
return table.getRows()
.stream()
.flatMap(row -> row.stream().filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells())))
.findAny()
.isEmpty();
@ -188,7 +208,7 @@ public class SectionsBuilderService {
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows().get(i);
if(row.size() == 1){
if (row.size() == 1) {
continue;
}
boolean allNonHeader = true;