Tables with only 2 columns are treated as one text
This commit is contained in:
parent
22f609a93a
commit
599c7bd6e4
@ -6,6 +6,7 @@ import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
@ -100,85 +101,22 @@ public class EntityRedactionService {
|
||||
boolean local, Map<Integer, Set<Entity>> hintsPerSectionNumber) {
|
||||
|
||||
Set<Entity> documentEntities = new HashSet<>();
|
||||
int sectionNumber = 1;
|
||||
|
||||
AtomicInteger sectionNumber = new AtomicInteger(1);
|
||||
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
|
||||
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
|
||||
|
||||
SearchableText searchableText = paragraph.getSearchableText();
|
||||
|
||||
List<Table> tables = paragraph.getTables();
|
||||
|
||||
for (Table table : tables) {
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
SearchableText searchableRow = new SearchableText();
|
||||
Map<String, CellValue> tabularData = new HashMap<>();
|
||||
int start = 0;
|
||||
List<Integer> cellStarts = new ArrayList<>();
|
||||
for (Cell cell : row) {
|
||||
if (CollectionUtils.isEmpty(cell.getTextBlocks())) {
|
||||
continue;
|
||||
}
|
||||
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber);
|
||||
int cellStart = start;
|
||||
cell.getHeaderCells().forEach(headerCell -> {
|
||||
StringBuilder headerBuilder = new StringBuilder();
|
||||
headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
|
||||
String headerName = headerBuilder.toString()
|
||||
.replaceAll("\n", "")
|
||||
.replaceAll(" ", "")
|
||||
.replaceAll("-", "");
|
||||
tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart));
|
||||
});
|
||||
|
||||
for (TextBlock textBlock : cell.getTextBlocks()) {
|
||||
// TODO avoid cell overlap merging.
|
||||
searchableRow.addAll(textBlock.getSequences());
|
||||
}
|
||||
cellStarts.add(cellStart);
|
||||
start = start + cell.toString().trim().length() + 1;
|
||||
}
|
||||
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber, dictionary, local);
|
||||
surroundingWordsService.addSurroundingText(rowEntities, searchableRow, dictionary, cellStarts);
|
||||
|
||||
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
|
||||
.isLocal(local)
|
||||
.dictionaryTypes(dictionary.getTypes())
|
||||
.entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber) ? Stream
|
||||
.concat(rowEntities.stream(), hintsPerSectionNumber.get(sectionNumber).stream())
|
||||
.collect(Collectors.toSet()) : rowEntities)
|
||||
.text(searchableRow.getAsStringWithLinebreaks())
|
||||
.searchText(searchableRow.toString())
|
||||
.headline(table.getHeadline())
|
||||
.sectionNumber(sectionNumber)
|
||||
.tabularData(tabularData)
|
||||
.searchableText(searchableRow)
|
||||
.dictionary(dictionary)
|
||||
.build(), searchableRow));
|
||||
|
||||
sectionNumber++;
|
||||
if (table.getColCount() == 2) {
|
||||
sectionSearchableTextPairs.addAll(processTableAsOneText(table, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
|
||||
} else {
|
||||
sectionSearchableTextPairs.addAll(processTablePerRow(table, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
|
||||
}
|
||||
sectionNumber++;
|
||||
sectionNumber.incrementAndGet();
|
||||
}
|
||||
|
||||
addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber);
|
||||
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber, dictionary, local);
|
||||
surroundingWordsService.addSurroundingText(entities, searchableText, dictionary);
|
||||
|
||||
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
|
||||
.isLocal(local)
|
||||
.dictionaryTypes(dictionary.getTypes())
|
||||
.entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber) ? Stream
|
||||
.concat(entities.stream(), hintsPerSectionNumber.get(sectionNumber).stream())
|
||||
.collect(Collectors.toSet()) : entities)
|
||||
.text(searchableText.getAsStringWithLinebreaks())
|
||||
.searchText(searchableText.toString())
|
||||
.headline(paragraph.getHeadline())
|
||||
.sectionNumber(sectionNumber)
|
||||
.searchableText(searchableText)
|
||||
.dictionary(dictionary)
|
||||
.build(), searchableText));
|
||||
|
||||
sectionNumber++;
|
||||
sectionSearchableTextPairs.add(processText(paragraph, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
|
||||
sectionNumber.incrementAndGet();
|
||||
}
|
||||
|
||||
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
|
||||
@ -214,6 +152,128 @@ public class EntityRedactionService {
|
||||
}
|
||||
|
||||
|
||||
private List<SectionSearchableTextPair> processTablePerRow(Table table, ManualRedactions manualRedactions,
|
||||
AtomicInteger sectionNumber, Dictionary dictionary, boolean local,
|
||||
Map<Integer, Set<Entity>> hintsPerSectionNumber) {
|
||||
|
||||
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
SearchableText searchableRow = new SearchableText();
|
||||
Map<String, CellValue> tabularData = new HashMap<>();
|
||||
int start = 0;
|
||||
List<Integer> cellStarts = new ArrayList<>();
|
||||
for (Cell cell : row) {
|
||||
if (CollectionUtils.isEmpty(cell.getTextBlocks())) {
|
||||
continue;
|
||||
}
|
||||
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue());
|
||||
int cellStart = start;
|
||||
cell.getHeaderCells().forEach(headerCell -> {
|
||||
StringBuilder headerBuilder = new StringBuilder();
|
||||
headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText()));
|
||||
String headerName = headerBuilder.toString()
|
||||
.replaceAll("\n", "")
|
||||
.replaceAll(" ", "")
|
||||
.replaceAll("-", "");
|
||||
tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart));
|
||||
});
|
||||
|
||||
for (TextBlock textBlock : cell.getTextBlocks()) {
|
||||
// TODO avoid cell overlap merging.
|
||||
searchableRow.addAll(textBlock.getSequences());
|
||||
}
|
||||
cellStarts.add(cellStart);
|
||||
start = start + cell.toString().trim().length() + 1;
|
||||
}
|
||||
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber.intValue(), dictionary, local);
|
||||
surroundingWordsService.addSurroundingText(rowEntities, searchableRow, dictionary, cellStarts);
|
||||
|
||||
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
|
||||
.isLocal(local)
|
||||
.dictionaryTypes(dictionary.getTypes())
|
||||
.entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber.intValue()) ? Stream
|
||||
.concat(rowEntities.stream(), hintsPerSectionNumber.get(sectionNumber.intValue()).stream())
|
||||
.collect(Collectors.toSet()) : rowEntities)
|
||||
.text(searchableRow.getAsStringWithLinebreaks())
|
||||
.searchText(searchableRow.toString())
|
||||
.headline(table.getHeadline())
|
||||
.sectionNumber(sectionNumber.intValue())
|
||||
.tabularData(tabularData)
|
||||
.searchableText(searchableRow)
|
||||
.dictionary(dictionary)
|
||||
.build(), searchableRow));
|
||||
|
||||
sectionNumber.incrementAndGet();
|
||||
}
|
||||
return sectionSearchableTextPairs;
|
||||
}
|
||||
|
||||
|
||||
private List<SectionSearchableTextPair> processTableAsOneText(Table table, ManualRedactions manualRedactions,
|
||||
AtomicInteger sectionNumber, Dictionary dictionary,
|
||||
boolean local,
|
||||
Map<Integer, Set<Entity>> hintsPerSectionNumber) {
|
||||
|
||||
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
|
||||
SearchableText entireTableText = new SearchableText();
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
for (Cell cell : row) {
|
||||
if (CollectionUtils.isEmpty(cell.getTextBlocks())) {
|
||||
continue;
|
||||
}
|
||||
for (TextBlock textBlock : cell.getTextBlocks()) {
|
||||
entireTableText.addAll(textBlock.getSequences());
|
||||
}
|
||||
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue());
|
||||
}
|
||||
}
|
||||
|
||||
Set<Entity> rowEntities = findEntities(entireTableText, table.getHeadline(), sectionNumber.intValue(), dictionary, local);
|
||||
surroundingWordsService.addSurroundingText(rowEntities, entireTableText, dictionary);
|
||||
|
||||
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
|
||||
.isLocal(local)
|
||||
.dictionaryTypes(dictionary.getTypes())
|
||||
.entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber.intValue()) ? Stream.concat(rowEntities
|
||||
.stream(), hintsPerSectionNumber.get(sectionNumber.intValue()).stream())
|
||||
.collect(Collectors.toSet()) : rowEntities)
|
||||
.text(entireTableText.getAsStringWithLinebreaks())
|
||||
.searchText(entireTableText.toString())
|
||||
.headline(table.getHeadline())
|
||||
.sectionNumber(sectionNumber.intValue())
|
||||
.searchableText(entireTableText)
|
||||
.dictionary(dictionary)
|
||||
.build(), entireTableText));
|
||||
|
||||
return sectionSearchableTextPairs;
|
||||
}
|
||||
|
||||
|
||||
private SectionSearchableTextPair processText(Paragraph paragraph, ManualRedactions manualRedactions,
|
||||
AtomicInteger sectionNumber, Dictionary dictionary, boolean local,
|
||||
Map<Integer, Set<Entity>> hintsPerSectionNumber) {
|
||||
|
||||
SearchableText searchableText = paragraph.getSearchableText();
|
||||
addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber.intValue());
|
||||
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber.intValue(), dictionary, local);
|
||||
surroundingWordsService.addSurroundingText(entities, searchableText, dictionary);
|
||||
|
||||
return new SectionSearchableTextPair(Section.builder()
|
||||
.isLocal(local)
|
||||
.dictionaryTypes(dictionary.getTypes())
|
||||
.entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber) ? Stream.concat(entities
|
||||
.stream(), hintsPerSectionNumber.get(sectionNumber).stream())
|
||||
.collect(Collectors.toSet()) : entities)
|
||||
.text(searchableText.getAsStringWithLinebreaks())
|
||||
.searchText(searchableText.toString())
|
||||
.headline(paragraph.getHeadline())
|
||||
.sectionNumber(sectionNumber.intValue())
|
||||
.searchableText(searchableText)
|
||||
.dictionary(dictionary)
|
||||
.build(), searchableText);
|
||||
}
|
||||
|
||||
|
||||
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber,
|
||||
Dictionary dictionary, boolean local) {
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user