Tables with only 2 columns are treated as one text

This commit is contained in:
Dominique Eifländer 2021-01-05 12:14:42 +01:00
parent 22f609a93a
commit 599c7bd6e4

View File

@ -6,6 +6,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@ -100,85 +101,22 @@ public class EntityRedactionService {
boolean local, Map<Integer, Set<Entity>> hintsPerSectionNumber) {
Set<Entity> documentEntities = new HashSet<>();
int sectionNumber = 1;
AtomicInteger sectionNumber = new AtomicInteger(1);
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
SearchableText searchableText = paragraph.getSearchableText();
List<Table> tables = paragraph.getTables();
for (Table table : tables) {
for (List<Cell> row : table.getRows()) {
SearchableText searchableRow = new SearchableText();
Map<String, CellValue> tabularData = new HashMap<>();
int start = 0;
List<Integer> cellStarts = new ArrayList<>();
for (Cell cell : row) {
if (CollectionUtils.isEmpty(cell.getTextBlocks())) {
continue;
}
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber);
int cellStart = start;
cell.getHeaderCells().forEach(headerCell -> {
StringBuilder headerBuilder = new StringBuilder();
headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
String headerName = headerBuilder.toString()
.replaceAll("\n", "")
.replaceAll(" ", "")
.replaceAll("-", "");
tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart));
});
for (TextBlock textBlock : cell.getTextBlocks()) {
// TODO avoid cell overlap merging.
searchableRow.addAll(textBlock.getSequences());
}
cellStarts.add(cellStart);
start = start + cell.toString().trim().length() + 1;
}
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber, dictionary, local);
surroundingWordsService.addSurroundingText(rowEntities, searchableRow, dictionary, cellStarts);
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(local)
.dictionaryTypes(dictionary.getTypes())
.entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber) ? Stream
.concat(rowEntities.stream(), hintsPerSectionNumber.get(sectionNumber).stream())
.collect(Collectors.toSet()) : rowEntities)
.text(searchableRow.getAsStringWithLinebreaks())
.searchText(searchableRow.toString())
.headline(table.getHeadline())
.sectionNumber(sectionNumber)
.tabularData(tabularData)
.searchableText(searchableRow)
.dictionary(dictionary)
.build(), searchableRow));
sectionNumber++;
if (table.getColCount() == 2) {
sectionSearchableTextPairs.addAll(processTableAsOneText(table, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
} else {
sectionSearchableTextPairs.addAll(processTablePerRow(table, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
}
sectionNumber++;
sectionNumber.incrementAndGet();
}
addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber);
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber, dictionary, local);
surroundingWordsService.addSurroundingText(entities, searchableText, dictionary);
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(local)
.dictionaryTypes(dictionary.getTypes())
.entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber) ? Stream
.concat(entities.stream(), hintsPerSectionNumber.get(sectionNumber).stream())
.collect(Collectors.toSet()) : entities)
.text(searchableText.getAsStringWithLinebreaks())
.searchText(searchableText.toString())
.headline(paragraph.getHeadline())
.sectionNumber(sectionNumber)
.searchableText(searchableText)
.dictionary(dictionary)
.build(), searchableText));
sectionNumber++;
sectionSearchableTextPairs.add(processText(paragraph, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
sectionNumber.incrementAndGet();
}
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
@ -214,6 +152,128 @@ public class EntityRedactionService {
}
/**
 * Converts every row of the given table into its own section.
 * <p>
 * For each row, the sequences of all text blocks of the non-empty cells are collected into one
 * {@link SearchableText}. Cells without text blocks are skipped. Each processed cell is also
 * registered through {@code addSectionToManualRedactions} under the current section number, and
 * its content is stored in the row's tabular data under every header name of the cell
 * (with line breaks, spaces and hyphens removed from the header text).
 * <p>
 * The section counter is advanced once per row, after the row's section has been built.
 *
 * @param table                 table whose rows are processed
 * @param manualRedactions      manual redactions forwarded to {@code addSectionToManualRedactions}
 * @param sectionNumber         shared section counter; read for the current row and incremented after each row
 * @param dictionary            dictionary used for entity lookup and stored in the section
 * @param local                 flag forwarded to {@code findEntities} and stored in the section
 * @param hintsPerSectionNumber optional (may be {@code null}) additional entities keyed by section number
 * @return one section/searchable-text pair per table row, in row order
 */
private List<SectionSearchableTextPair> processTablePerRow(Table table, ManualRedactions manualRedactions,
                                                           AtomicInteger sectionNumber, Dictionary dictionary, boolean local,
                                                           Map<Integer, Set<Entity>> hintsPerSectionNumber) {

    List<SectionSearchableTextPair> rowSections = new ArrayList<>();

    for (List<Cell> row : table.getRows()) {
        // The counter only changes at the end of the iteration, so one read per row is enough.
        final int currentSection = sectionNumber.intValue();

        SearchableText rowText = new SearchableText();
        Map<String, CellValue> valuesByHeader = new HashMap<>();
        List<Integer> cellOffsets = new ArrayList<>();
        int nextOffset = 0;

        for (Cell cell : row) {
            if (!CollectionUtils.isEmpty(cell.getTextBlocks())) {
                addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), currentSection);

                // Offset at which this cell starts; captured as a constant for use in the lambda below.
                final int cellOffset = nextOffset;

                cell.getHeaderCells().forEach(header -> {
                    StringBuilder rawHeader = new StringBuilder();
                    header.getTextBlocks().forEach(block -> rawHeader.append(block.getText()));
                    String normalizedHeader = rawHeader.toString()
                            .replaceAll("\n", "")
                            .replaceAll(" ", "")
                            .replaceAll("-", "");
                    valuesByHeader.put(normalizedHeader, new CellValue(cell.getTextBlocks(), cellOffset));
                });

                // TODO avoid cell overlap merging.
                for (TextBlock block : cell.getTextBlocks()) {
                    rowText.addAll(block.getSequences());
                }

                cellOffsets.add(cellOffset);
                // Advance by the trimmed cell text length plus one.
                nextOffset += cell.toString().trim().length() + 1;
            }
        }

        Set<Entity> rowEntities = findEntities(rowText, table.getHeadline(), currentSection, dictionary, local);
        surroundingWordsService.addSurroundingText(rowEntities, rowText, dictionary, cellOffsets);

        // Merge in hints registered for this section number, if any.
        Set<Entity> sectionEntities = rowEntities;
        if (hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(currentSection)) {
            sectionEntities = Stream.concat(rowEntities.stream(), hintsPerSectionNumber.get(currentSection).stream())
                    .collect(Collectors.toSet());
        }

        Section rowSection = Section.builder()
                .isLocal(local)
                .dictionaryTypes(dictionary.getTypes())
                .entities(sectionEntities)
                .text(rowText.getAsStringWithLinebreaks())
                .searchText(rowText.toString())
                .headline(table.getHeadline())
                .sectionNumber(currentSection)
                .tabularData(valuesByHeader)
                .searchableText(rowText)
                .dictionary(dictionary)
                .build();
        rowSections.add(new SectionSearchableTextPair(rowSection, rowText));

        sectionNumber.incrementAndGet();
    }

    return rowSections;
}
/**
 * Treats the whole table as a single block of text and builds exactly one section for it.
 * <p>
 * The sequences of all text blocks of all non-empty cells are appended, row by row and cell by
 * cell, to one {@link SearchableText}. Every non-empty cell is registered through
 * {@code addSectionToManualRedactions} under the current section number. No tabular data is
 * attached to the resulting section.
 * <p>
 * This method only reads the section counter; it does not advance it.
 *
 * @param table                 table whose cells are merged into one text
 * @param manualRedactions      manual redactions forwarded to {@code addSectionToManualRedactions}
 * @param sectionNumber         section counter; its current value is used for the section
 * @param dictionary            dictionary used for entity lookup and stored in the section
 * @param local                 flag forwarded to {@code findEntities} and stored in the section
 * @param hintsPerSectionNumber optional (may be {@code null}) additional entities keyed by section number
 * @return a list containing the single section/searchable-text pair of the table
 */
private List<SectionSearchableTextPair> processTableAsOneText(Table table, ManualRedactions manualRedactions,
                                                              AtomicInteger sectionNumber, Dictionary dictionary,
                                                              boolean local,
                                                              Map<Integer, Set<Entity>> hintsPerSectionNumber) {

    // Nothing in this method changes the counter, so a single read is sufficient.
    final int currentSection = sectionNumber.intValue();

    SearchableText tableText = new SearchableText();
    for (List<Cell> row : table.getRows()) {
        for (Cell cell : row) {
            if (!CollectionUtils.isEmpty(cell.getTextBlocks())) {
                for (TextBlock block : cell.getTextBlocks()) {
                    tableText.addAll(block.getSequences());
                }
                addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), currentSection);
            }
        }
    }

    Set<Entity> tableEntities = findEntities(tableText, table.getHeadline(), currentSection, dictionary, local);
    surroundingWordsService.addSurroundingText(tableEntities, tableText, dictionary);

    // Merge in hints registered for this section number, if any.
    Set<Entity> sectionEntities = tableEntities;
    if (hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(currentSection)) {
        sectionEntities = Stream.concat(tableEntities.stream(), hintsPerSectionNumber.get(currentSection).stream())
                .collect(Collectors.toSet());
    }

    Section tableSection = Section.builder()
            .isLocal(local)
            .dictionaryTypes(dictionary.getTypes())
            .entities(sectionEntities)
            .text(tableText.getAsStringWithLinebreaks())
            .searchText(tableText.toString())
            .headline(table.getHeadline())
            .sectionNumber(currentSection)
            .searchableText(tableText)
            .dictionary(dictionary)
            .build();

    List<SectionSearchableTextPair> result = new ArrayList<>();
    result.add(new SectionSearchableTextPair(tableSection, tableText));
    return result;
}
/**
 * Builds the section for the searchable text of a paragraph.
 * <p>
 * The paragraph's text blocks are registered through {@code addSectionToManualRedactions} under
 * the current section number, entities are looked up with {@code findEntities}, and the result
 * is combined with any hints registered for the same section number.
 * <p>
 * This method only reads the section counter; it does not advance it.
 *
 * @param paragraph             paragraph whose searchable text is processed
 * @param manualRedactions      manual redactions forwarded to {@code addSectionToManualRedactions}
 * @param sectionNumber         section counter; its current value is used for the section
 * @param dictionary            dictionary used for entity lookup and stored in the section
 * @param local                 flag forwarded to {@code findEntities} and stored in the section
 * @param hintsPerSectionNumber optional (may be {@code null}) additional entities keyed by section number
 * @return the section together with the searchable text it was built from
 */
private SectionSearchableTextPair processText(Paragraph paragraph, ManualRedactions manualRedactions,
                                              AtomicInteger sectionNumber, Dictionary dictionary, boolean local,
                                              Map<Integer, Set<Entity>> hintsPerSectionNumber) {

    // Nothing in this method changes the counter, so a single read is sufficient.
    final int currentSection = sectionNumber.intValue();

    SearchableText searchableText = paragraph.getSearchableText();
    addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), currentSection);
    Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), currentSection, dictionary, local);
    surroundingWordsService.addSurroundingText(entities, searchableText, dictionary);

    // The hint map is keyed by Integer, so it must be queried with the int value. Passing the
    // AtomicInteger itself never matches a key (it is not equal to any Integer), which silently
    // dropped all hints for paragraph sections.
    Set<Entity> sectionEntities = entities;
    if (hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(currentSection)) {
        sectionEntities = Stream.concat(entities.stream(), hintsPerSectionNumber.get(currentSection).stream())
                .collect(Collectors.toSet());
    }

    return new SectionSearchableTextPair(Section.builder()
            .isLocal(local)
            .dictionaryTypes(dictionary.getTypes())
            .entities(sectionEntities)
            .text(searchableText.getAsStringWithLinebreaks())
            .searchText(searchableText.toString())
            .headline(paragraph.getHeadline())
            .sectionNumber(currentSection)
            .searchableText(searchableText)
            .dictionary(dictionary)
            .build(), searchableText);
}
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber,
Dictionary dictionary, boolean local) {