RED-1045: Enabled redaction in headers and footers
This commit is contained in:
parent
154e09b843
commit
577db37b11
@ -18,6 +18,8 @@ public class Document {
|
||||
|
||||
private List<Page> pages = new ArrayList<>();
|
||||
private List<Paragraph> paragraphs = new ArrayList<>();
|
||||
private List<Header> headers = new ArrayList<>();
|
||||
private List<Footer> footers = new ArrayList<>();
|
||||
private Map<Integer, List<Entity>> entities = new HashMap<>();
|
||||
private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter();
|
||||
private FloatFrequencyCounter fontSizeCounter= new FloatFrequencyCounter();
|
||||
|
||||
@ -0,0 +1,24 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class Footer {
|
||||
|
||||
private List<TextBlock> textBlocks;
|
||||
|
||||
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
textBlocks.forEach(block -> searchableText.addAll(block.getSequences()));
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,24 @@
|
||||
package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class Header {
|
||||
|
||||
private List<TextBlock> textBlocks;
|
||||
|
||||
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
textBlocks.forEach(block -> searchableText.addAll(block.getSequences()));
|
||||
return searchableText;
|
||||
}
|
||||
|
||||
}
|
||||
@ -19,6 +19,8 @@ import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
@ -113,7 +115,18 @@ public class EntityRedactionService {
|
||||
}
|
||||
sectionNumber.incrementAndGet();
|
||||
}
|
||||
sectionSearchableTextPairs.add(processText(paragraph, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
|
||||
sectionSearchableTextPairs.add(processText(paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph
|
||||
.getHeadline(), manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
|
||||
sectionNumber.incrementAndGet();
|
||||
}
|
||||
|
||||
for (Header header : classifiedDoc.getHeaders()) {
|
||||
sectionSearchableTextPairs.add(processText(header.getSearchableText(), header.getTextBlocks(), "Header", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
|
||||
sectionNumber.incrementAndGet();
|
||||
}
|
||||
|
||||
for (Footer footer : classifiedDoc.getFooters()) {
|
||||
sectionSearchableTextPairs.add(processText(footer.getSearchableText(), footer.getTextBlocks(), "Footer", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
|
||||
sectionNumber.incrementAndGet();
|
||||
}
|
||||
|
||||
@ -253,14 +266,13 @@ public class EntityRedactionService {
|
||||
}
|
||||
|
||||
|
||||
private SectionSearchableTextPair processText(Paragraph paragraph, ManualRedactions manualRedactions,
|
||||
private SectionSearchableTextPair processText(SearchableText searchableText, List<TextBlock> paragraphTextBlocks,
|
||||
String headline, ManualRedactions manualRedactions,
|
||||
AtomicInteger sectionNumber, Dictionary dictionary, boolean local,
|
||||
Map<Integer, Set<Entity>> hintsPerSectionNumber) {
|
||||
|
||||
SearchableText searchableText = paragraph.getSearchableText();
|
||||
addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber
|
||||
.intValue());
|
||||
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber.intValue(), dictionary, local);
|
||||
addSectionToManualRedactions(paragraphTextBlocks, manualRedactions, headline, sectionNumber.intValue());
|
||||
Set<Entity> entities = findEntities(searchableText, headline, sectionNumber.intValue(), dictionary, local);
|
||||
surroundingWordsService.addSurroundingText(entities, searchableText, dictionary);
|
||||
|
||||
return new SectionSearchableTextPair(Section.builder()
|
||||
@ -271,7 +283,7 @@ public class EntityRedactionService {
|
||||
.collect(Collectors.toSet()) : entities)
|
||||
.text(searchableText.getAsStringWithLinebreaks())
|
||||
.searchText(searchableText.toString())
|
||||
.headline(paragraph.getHeadline())
|
||||
.headline(headline)
|
||||
.sectionNumber(sectionNumber.intValue())
|
||||
.searchableText(searchableText)
|
||||
.dictionary(dictionary)
|
||||
|
||||
@ -10,6 +10,8 @@ import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
@ -24,25 +26,39 @@ public class SectionsBuilderService {
|
||||
|
||||
List<AbstractTextContainer> chunkWords = new ArrayList<>();
|
||||
List<Paragraph> chunkBlockList = new ArrayList<>();
|
||||
List<Header> headers = new ArrayList<>();
|
||||
List<Footer> footers = new ArrayList<>();
|
||||
|
||||
AbstractTextContainer prev = null;
|
||||
|
||||
String lastHeadline = "";
|
||||
Table previousTable = null;
|
||||
for (Page page : document.getPages()) {
|
||||
List<TextBlock> header = new ArrayList<>();
|
||||
List<TextBlock> footer = new ArrayList<>();
|
||||
for (AbstractTextContainer current : page.getTextBlocks()) {
|
||||
|
||||
if (current.getClassification() == null || current.getClassification()
|
||||
.equals("Header") || current.getClassification().equals("Footer")) {
|
||||
if (current.getClassification() == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
current.setPage(page.getPageNumber());
|
||||
|
||||
if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification().startsWith("H ") || !document.isHeadlines()) {
|
||||
if (current.getClassification().equals("Header")) {
|
||||
header.add((TextBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getClassification().equals("Footer")) {
|
||||
footer.add((TextBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification()
|
||||
.startsWith("H ") || !document.isHeadlines()) {
|
||||
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
||||
chunkBlock.setHeadline(lastHeadline);
|
||||
if(document.isHeadlines()) {
|
||||
if (document.isHeadlines()) {
|
||||
lastHeadline = current.getText();
|
||||
}
|
||||
chunkBlockList.add(chunkBlock);
|
||||
@ -60,6 +76,8 @@ public class SectionsBuilderService {
|
||||
chunkWords.add(current);
|
||||
prev = current;
|
||||
}
|
||||
headers.add(new Header(header));
|
||||
footers.add(new Footer(footer));
|
||||
}
|
||||
|
||||
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
||||
@ -67,6 +85,8 @@ public class SectionsBuilderService {
|
||||
chunkBlockList.add(chunkBlock);
|
||||
|
||||
document.setParagraphs(chunkBlockList);
|
||||
document.setHeaders(headers);
|
||||
document.setFooters(footers);
|
||||
}
|
||||
|
||||
|
||||
@ -175,9 +195,9 @@ public class SectionsBuilderService {
|
||||
|
||||
private boolean hasInvalidHeaderInformation(Table table) {
|
||||
|
||||
return table.getRows().stream()
|
||||
.flatMap(row -> row.stream()
|
||||
.filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells())))
|
||||
return table.getRows()
|
||||
.stream()
|
||||
.flatMap(row -> row.stream().filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells())))
|
||||
.findAny()
|
||||
.isEmpty();
|
||||
|
||||
@ -188,7 +208,7 @@ public class SectionsBuilderService {
|
||||
|
||||
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = table.getRows().get(i);
|
||||
if(row.size() == 1){
|
||||
if (row.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
boolean allNonHeader = true;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user