From 599c7bd6e474b61c32f92d6c67e53a80de13a2e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominique=20Eifl=C3=A4nder?= Date: Tue, 5 Jan 2021 12:14:42 +0100 Subject: [PATCH] Tables with only 2 columns are treated as one text --- .../service/EntityRedactionService.java | 204 +++++++++++------- 1 file changed, 132 insertions(+), 72 deletions(-) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index b599fc2f..16f20cd1 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -6,6 +6,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -100,85 +101,22 @@ public class EntityRedactionService { boolean local, Map> hintsPerSectionNumber) { Set documentEntities = new HashSet<>(); - int sectionNumber = 1; + + AtomicInteger sectionNumber = new AtomicInteger(1); List sectionSearchableTextPairs = new ArrayList<>(); for (Paragraph paragraph : classifiedDoc.getParagraphs()) { - SearchableText searchableText = paragraph.getSearchableText(); List tables = paragraph.getTables(); for (Table table : tables) { - for (List row : table.getRows()) { - SearchableText searchableRow = new SearchableText(); - Map tabularData = new HashMap<>(); - int start = 0; - List cellStarts = new ArrayList<>(); - for (Cell cell : row) { - if (CollectionUtils.isEmpty(cell.getTextBlocks())) 
{ - continue; - } - addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber); - int cellStart = start; - cell.getHeaderCells().forEach(headerCell -> { - StringBuilder headerBuilder = new StringBuilder(); - headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText())); - String headerName = headerBuilder.toString() - .replaceAll("\n", "") - .replaceAll(" ", "") - .replaceAll("-", ""); - tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart)); - }); - - for (TextBlock textBlock : cell.getTextBlocks()) { - // TODO avoid cell overlap merging. - searchableRow.addAll(textBlock.getSequences()); - } - cellStarts.add(cellStart); - start = start + cell.toString().trim().length() + 1; - } - Set rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber, dictionary, local); - surroundingWordsService.addSurroundingText(rowEntities, searchableRow, dictionary, cellStarts); - - sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() - .isLocal(local) - .dictionaryTypes(dictionary.getTypes()) - .entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber) ? 
Stream - .concat(rowEntities.stream(), hintsPerSectionNumber.get(sectionNumber).stream()) - .collect(Collectors.toSet()) : rowEntities) - .text(searchableRow.getAsStringWithLinebreaks()) - .searchText(searchableRow.toString()) - .headline(table.getHeadline()) - .sectionNumber(sectionNumber) - .tabularData(tabularData) - .searchableText(searchableRow) - .dictionary(dictionary) - .build(), searchableRow)); - - sectionNumber++; + if (table.getColCount() == 2) { + sectionSearchableTextPairs.addAll(processTableAsOneText(table, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber)); + } else { + sectionSearchableTextPairs.addAll(processTablePerRow(table, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber)); } - sectionNumber++; + sectionNumber.incrementAndGet(); } - - addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber); - Set entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber, dictionary, local); - surroundingWordsService.addSurroundingText(entities, searchableText, dictionary); - - sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() - .isLocal(local) - .dictionaryTypes(dictionary.getTypes()) - .entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber) ? 
Stream - .concat(entities.stream(), hintsPerSectionNumber.get(sectionNumber).stream()) - .collect(Collectors.toSet()) : entities) - .text(searchableText.getAsStringWithLinebreaks()) - .searchText(searchableText.toString()) - .headline(paragraph.getHeadline()) - .sectionNumber(sectionNumber) - .searchableText(searchableText) - .dictionary(dictionary) - .build(), searchableText)); - - sectionNumber++; + sectionSearchableTextPairs.add(processText(paragraph, manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber)); + sectionNumber.incrementAndGet(); } sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { @@ -214,6 +152,128 @@ public class EntityRedactionService { } + private List processTablePerRow(Table table, ManualRedactions manualRedactions, + AtomicInteger sectionNumber, Dictionary dictionary, boolean local, + Map> hintsPerSectionNumber) { + + List sectionSearchableTextPairs = new ArrayList<>(); + for (List row : table.getRows()) { + SearchableText searchableRow = new SearchableText(); + Map tabularData = new HashMap<>(); + int start = 0; + List cellStarts = new ArrayList<>(); + for (Cell cell : row) { + if (CollectionUtils.isEmpty(cell.getTextBlocks())) { + continue; + } + addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue()); + int cellStart = start; + cell.getHeaderCells().forEach(headerCell -> { + StringBuilder headerBuilder = new StringBuilder(); + headerCell.getTextBlocks().forEach(textBlock -> headerBuilder.append(textBlock.getText())); + String headerName = headerBuilder.toString() + .replaceAll("\n", "") + .replaceAll(" ", "") + .replaceAll("-", ""); + tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart)); + }); + + for (TextBlock textBlock : cell.getTextBlocks()) { + // TODO avoid cell overlap merging. 
+ searchableRow.addAll(textBlock.getSequences()); + } + cellStarts.add(cellStart); + start = start + cell.toString().trim().length() + 1; + } + Set rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber.intValue(), dictionary, local); + surroundingWordsService.addSurroundingText(rowEntities, searchableRow, dictionary, cellStarts); + + sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() + .isLocal(local) + .dictionaryTypes(dictionary.getTypes()) + .entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber.intValue()) ? Stream + .concat(rowEntities.stream(), hintsPerSectionNumber.get(sectionNumber.intValue()).stream()) + .collect(Collectors.toSet()) : rowEntities) + .text(searchableRow.getAsStringWithLinebreaks()) + .searchText(searchableRow.toString()) + .headline(table.getHeadline()) + .sectionNumber(sectionNumber.intValue()) + .tabularData(tabularData) + .searchableText(searchableRow) + .dictionary(dictionary) + .build(), searchableRow)); + + sectionNumber.incrementAndGet(); + } + return sectionSearchableTextPairs; + } + + + private List processTableAsOneText(Table table, ManualRedactions manualRedactions, + AtomicInteger sectionNumber, Dictionary dictionary, + boolean local, + Map> hintsPerSectionNumber) { + + List sectionSearchableTextPairs = new ArrayList<>(); + SearchableText entireTableText = new SearchableText(); + for (List row : table.getRows()) { + for (Cell cell : row) { + if (CollectionUtils.isEmpty(cell.getTextBlocks())) { + continue; + } + for (TextBlock textBlock : cell.getTextBlocks()) { + entireTableText.addAll(textBlock.getSequences()); + } + addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue()); + } + } + + Set rowEntities = findEntities(entireTableText, table.getHeadline(), sectionNumber.intValue(), dictionary, local); + surroundingWordsService.addSurroundingText(rowEntities, entireTableText, dictionary); 
+ + sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() + .isLocal(local) + .dictionaryTypes(dictionary.getTypes()) + .entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber.intValue()) ? Stream.concat(rowEntities + .stream(), hintsPerSectionNumber.get(sectionNumber.intValue()).stream()) + .collect(Collectors.toSet()) : rowEntities) + .text(entireTableText.getAsStringWithLinebreaks()) + .searchText(entireTableText.toString()) + .headline(table.getHeadline()) + .sectionNumber(sectionNumber.intValue()) + .searchableText(entireTableText) + .dictionary(dictionary) + .build(), entireTableText)); + + return sectionSearchableTextPairs; + } + + + private SectionSearchableTextPair processText(Paragraph paragraph, ManualRedactions manualRedactions, + AtomicInteger sectionNumber, Dictionary dictionary, boolean local, + Map> hintsPerSectionNumber) { + + SearchableText searchableText = paragraph.getSearchableText(); + addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber.intValue()); + Set entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber.intValue(), dictionary, local); + surroundingWordsService.addSurroundingText(entities, searchableText, dictionary); + + return new SectionSearchableTextPair(Section.builder() + .isLocal(local) + .dictionaryTypes(dictionary.getTypes()) + .entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber.intValue()) ? 
Stream.concat(entities + .stream(), hintsPerSectionNumber.get(sectionNumber.intValue()).stream()) + .collect(Collectors.toSet()) : entities) + .text(searchableText.getAsStringWithLinebreaks()) + .searchText(searchableText.toString()) + .headline(paragraph.getHeadline()) + .sectionNumber(sectionNumber.intValue()) + .searchableText(searchableText) + .dictionary(dictionary) + .build(), searchableText); + } + + private Set findEntities(SearchableText searchableText, String headline, int sectionNumber, Dictionary dictionary, boolean local) {