diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java index 0c5d5e59..5f92dde0 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java @@ -10,7 +10,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; import lombok.Data; import lombok.NoArgsConstructor; - @Data @NoArgsConstructor public class Paragraph { @@ -18,10 +17,12 @@ public class Paragraph { private List pageBlocks = new ArrayList<>(); private String headline; - public SearchableText getSearchableText(){ + + public SearchableText getSearchableText() { + SearchableText searchableText = new SearchableText(); pageBlocks.forEach(block -> { - if(block instanceof TextBlock){ + if (block instanceof TextBlock) { searchableText.addAll(((TextBlock) block).getSequences()); } }); @@ -29,14 +30,15 @@ public class Paragraph { } - public List getTables(){ + public List
getTables() { + List
tables = new ArrayList<>(); pageBlocks.forEach(block -> { - if(block instanceof Table){ + if (block instanceof Table) { tables.add((Table) block); } }); return tables; } -} +} \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/StringFrequencyCounter.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/StringFrequencyCounter.java index 36404fbf..0cbdfcc0 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/StringFrequencyCounter.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/StringFrequencyCounter.java @@ -5,43 +5,45 @@ import java.util.Map; import lombok.Getter; -/** - * - */ public class StringFrequencyCounter { @Getter - Map countPerValue = new HashMap<>(); + private final Map countPerValue = new HashMap<>(); - public void add(String value){ - if(!countPerValue.containsKey(value)){ + + public void add(String value) { + + if (!countPerValue.containsKey(value)) { countPerValue.put(value, 1); } else { countPerValue.put(value, countPerValue.get(value) + 1); } } - public void addAll(Map otherCounter){ - for(Map.Entry entry: otherCounter.entrySet()){ - if(countPerValue.containsKey(entry.getKey())){ - countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey())+ entry.getValue()); + + public void addAll(Map otherCounter) { + + for (Map.Entry entry : otherCounter.entrySet()) { + if (countPerValue.containsKey(entry.getKey())) { + countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue()); } else { countPerValue.put(entry.getKey(), entry.getValue()); } } } - public String getMostPopular(){ + + public String getMostPopular() { + Map.Entry mostPopular = null; - for(Map.Entry entry: countPerValue.entrySet()){ - if(mostPopular == null){ + for (Map.Entry entry : countPerValue.entrySet()) { + if (mostPopular == null) { mostPopular = entry; - } else if(entry.getValue() > mostPopular.getValue()){ + } else if (entry.getValue() > mostPopular.getValue()) { mostPopular = entry; } } return mostPopular != null ? mostPopular.getKey() : null; } - -} +} \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java index 0a3240bf..d4b83409 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java @@ -29,20 +29,16 @@ public class BlockificationService { float minX = 1000, maxX = 0, minY = 1000, maxY = 0; TextPositionSequence prev = null; - for (TextPositionSequence word : textPositions) { boolean lineSeparation = minY - word.getY2() > word.getHeight() * 1.25; boolean startFromTop = word.getY1() > maxY + word.getHeight(); - if (prev != null && - (lineSeparation - || startFromTop - || word.getRotation() == 0 && isSplittedByRuling(maxX, minY, word.getX1(), word.getY1(), verticalRulingLines) - || word.getRotation() == 0 && isSplittedByRuling(minX, minY, word.getX1(), word.getY2(), horizontalRulingLines) - || word.getRotation() == 90 && isSplittedByRuling(maxX, minY, word.getX1(), word.getY1(), horizontalRulingLines) - || word.getRotation() == 90 && isSplittedByRuling(minX, minY, word.getX1(), word.getY2(), verticalRulingLines) - )) { + if (prev != null && (lineSeparation || startFromTop || word.getRotation() == 0 && isSplittedByRuling(maxX, minY, word + .getX1(), word.getY1(), verticalRulingLines) || word.getRotation() == 0 && isSplittedByRuling(minX, minY, word + .getX1(), word.getY2(), horizontalRulingLines) || word.getRotation() == 90 && isSplittedByRuling(maxX, minY, word + .getX1(), word.getY1(), horizontalRulingLines) || word.getRotation() == 90 && isSplittedByRuling(minX, minY, word + .getX1(), word.getY2(), verticalRulingLines))) { TextBlock cb1 = buildTextBlock(chunkWords); chunkBlockList1.add(cb1); @@ -100,11 +96,12 @@ public class BlockificationService { styleFrequencyCounter.add(wordBlock.getFontStyle()); if (textBlock == null) { - textBlock = new TextBlock(wordBlock.getX1(), wordBlock.getX2(), wordBlock.getY1(), wordBlock.getY2(), wordBlockList, wordBlock.getRotation()); + textBlock = new TextBlock(wordBlock.getX1(), wordBlock.getX2(), wordBlock.getY1(), wordBlock.getY2(), wordBlockList, wordBlock + .getRotation()); } else { TextBlock spatialEntity = textBlock.union(wordBlock); - textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), - spatialEntity.getWidth(), spatialEntity.getHeight()); + textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity + .getHeight()); } } @@ -122,6 +119,7 @@ public class BlockificationService { private boolean isSplittedByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines) { + for (Ruling ruling : rulingLines) { if (ruling.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { return true; @@ -133,7 +131,6 @@ public class BlockificationService { public Rectangle calculateBodyTextFrame(List pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) { - float minX = 10000; float maxX = -100; float minY = 10000; @@ -147,7 +144,6 @@ public class BlockificationService { for (AbstractTextContainer container : page.getTextBlocks()) { - if (container instanceof TextBlock) { TextBlock textBlock = (TextBlock) container; if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) { @@ -179,16 +175,15 @@ public class BlockificationService { } } - if (container instanceof Table) { Table table = (Table) container; for (List row : table.getRows()) { - for (Cell column : row) { + for (Cell cell : row) { - if (column == null || column.getTextBlocks() == null) { + if (cell == null || cell.getTextBlocks() == null) { continue; } - for (TextBlock textBlock : column.getTextBlocks()) { + for (TextBlock textBlock : cell.getTextBlocks()) { if (textBlock.getMinX() < minX) { minX = textBlock.getMinX(); } @@ -211,5 +206,4 @@ public class BlockificationService { return new Rectangle(minY, minX, maxX - minX, maxY - minY); } - } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java index f4195f6a..a8207e08 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java @@ -8,10 +8,9 @@ import java.util.regex.Pattern; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; -@SuppressWarnings("all") public class SearchableText { - private List sequences = new ArrayList<>(); + private final List sequences = new ArrayList<>(); public void add(TextPositionSequence textPositionSequence) { @@ -26,6 +25,7 @@ public class SearchableText { } + @SuppressWarnings("checkstyle:ModifiedControlVariable") public List getSequences(String searchString, boolean caseInsensitive) { String normalizedSearchString; @@ -163,7 +163,7 @@ public class SearchableText { return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()) .replaceAll("\n", " ") - .replaceAll(" ", " "); + .replaceAll(" {2}", " "); } @@ -187,4 +187,4 @@ public class SearchableText { return sb.append("\n").toString(); } -} +} \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index 736ac223..dd6f9419 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -3,15 +3,19 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; import java.util.ArrayList; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.regex.Pattern; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; import lombok.Builder; import lombok.Data; +import lombok.extern.slf4j.Slf4j; @Data +@Slf4j @Builder public class Section { @@ -27,6 +31,8 @@ public class Section { private int sectionNumber; + private Map tabularData; + public boolean contains(String type) { @@ -71,7 +77,7 @@ public class Section { if (values != null) { for (String value : values) { if (StringUtils.isNotBlank(value)) { - Set found = findEntity(value.trim(), asType); + Set found = findEntities(value.trim(), asType); entities.addAll(found); } } @@ -95,8 +101,8 @@ public class Section { if (values != null) { for (String value : values) { - if (value != null && StringUtils.isNotBlank(value)) { - Set found = findEntity(value.trim(), asType); + if (StringUtils.isNotBlank(value)) { + Set found = findEntities(value.trim(), asType); entities.addAll(found); } } @@ -113,7 +119,7 @@ public class Section { } - private Set findEntity(String value, String asType) { + private Set findEntities(String value, String asType) { Set found = new HashSet<>(); @@ -154,4 +160,25 @@ public class Section { return entities; } -} + + public void highlightCell(String cellHeader, int ruleNumber) { + + String value = tabularData.get(cellHeader); + if (value == null) { + log.warn("Could not find any data for {}.", cellHeader); + } else { + Set found = findEntities(value, "must_redact"); + if (CollectionUtils.isEmpty(found)) { + log.warn("Could not identify value {} in row.", value); + } else { + Entity entity = found.iterator().next(); + entity.setRedaction(false); + entity.setMatchedRule(ruleNumber); + entity.setRedactionReason(cellHeader); + entities.add(entity); + } + } + + } + +} \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index 1a023459..eef620e1 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -8,6 +8,8 @@ import java.util.Map; import java.util.Set; import java.util.regex.Pattern; +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.StringUtils; import org.springframework.stereotype.Service; import com.iqser.red.service.redaction.v1.server.classification.model.Document; @@ -44,24 +46,30 @@ public class EntityRedactionService { List
tables = paragraph.getTables(); for (Table table : tables) { + List metadata = table.getHeaders(); for (List row : table.getRows()) { SearchableText searchableRow = new SearchableText(); - for (Cell column : row) { - if (column == null || column.getTextBlocks() == null) { + List cellValues = new ArrayList<>(); + for (Cell cell : row) { + if (cell == null || CollectionUtils.isEmpty(cell.getTextBlocks())) { + cellValues.add(null); continue; } - for (TextBlock textBlock : column.getTextBlocks()) { + cellValues.add(cell.getTextBlocks().get(0).getText()); + for (TextBlock textBlock : cell.getTextBlocks()) { searchableRow.addAll(textBlock.getSequences()); } } Set rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber); + Map tabularData = toMap(metadata, cellValues); Section analysedRowSection = droolsExecutionService.executeRules(Section.builder() .entities(rowEntities) .text(searchableRow.getAsStringWithLinebreaks()) .searchText(searchableRow.toString()) .headline(table.getHeadline()) .sectionNumber(sectionNumber) + .tabularData(tabularData) .build()); documentEntities.addAll(clearAndFindPositions(analysedRowSection.getEntities(), searchableRow)); @@ -93,7 +101,8 @@ public class EntityRedactionService { for (Map.Entry> entry : sequenceOnPage.entrySet()) { classifiedDoc.getEntities() .computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) - .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry + .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), + entity.getRedactionReason(), entry .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber())); } } @@ -101,6 +110,21 @@ public class EntityRedactionService { } + private Map toMap(List keys, List values) { + + if (keys.size() != values.size()) { + throw new RuntimeException("Cannot merge lists of unequal size."); + } + Map result = new HashMap<>(); + for (int i = 0; i < keys.size(); i++) { + result.put(keys.get(i), values.get(i)); + } + + return result; + + } + + private Set clearAndFindPositions(Set entities, SearchableText text) { removeEntitiesContainedInLarger(entities); @@ -119,12 +143,14 @@ public class EntityRedactionService { private Set findEntities(SearchableText searchableText, String headline, int sectionNumber) { + Set found = new HashSet<>(); + if (StringUtils.isEmpty(searchableText.toString()) && StringUtils.isEmpty(headline)) { + return found; + } + String inputString = searchableText.toString(); String lowercaseInputString = inputString.toLowerCase(); - - Set found = new HashSet<>(); for (Map.Entry> entry : dictionaryService.getDictionary().entrySet()) { - if (dictionaryService.getCaseInsensitiveTypes().contains(entry.getKey())) { found.addAll(find(lowercaseInputString, entry.getValue(), entry.getKey(), headline, sectionNumber)); } else { @@ -151,7 +177,8 @@ public class EntityRedactionService { if (startIndex > -1 && (startIndex == 0 || Character.isWhitespace(inputString.charAt(startIndex - 1)) || isSeparator(inputString .charAt(startIndex - 1))) && (stopIndex == inputString.length() || isSeparator(inputString.charAt(stopIndex)))) { - found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, headline, sectionNumber)); + found.add(new Entity(inputString.substring(startIndex, stopIndex), type, startIndex, stopIndex, + headline, sectionNumber)); } } while (startIndex > -1); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java index 9b402fcd..9caff395 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java @@ -29,7 +29,6 @@ import lombok.extern.slf4j.Slf4j; @Slf4j @Service @RequiredArgsConstructor -@SuppressWarnings("PMD") public class PdfSegmentationService { private final RulingCleaningService rulingCleaningService; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java index 04a45e91..8d005ccd 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java @@ -4,6 +4,8 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.StringUtils; import org.springframework.stereotype.Service; import com.iqser.red.service.redaction.v1.server.classification.model.Document; @@ -14,7 +16,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractT import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; @Service -@SuppressWarnings("all") public class SectionsBuilderService { public void buildSections(Document document) { @@ -25,6 +26,7 @@ public class SectionsBuilderService { AbstractTextContainer prev = null; String lastHeadline = ""; + Table previousTable = null; for (Page page : document.getPages()) { for (AbstractTextContainer current : page.getTextBlocks()) { @@ -36,32 +38,30 @@ public class SectionsBuilderService { current.setPage(page.getPageNumber()); if (prev != null && current.getClassification().startsWith("H ") || !document.isHeadlines()) { - - Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline); + Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline, previousTable); chunkBlock.setHeadline(lastHeadline); lastHeadline = current.getText(); + if (CollectionUtils.isNotEmpty(chunkBlock.getTables())) { + previousTable = chunkBlock.getTables().get(0); + } chunkBlockList.add(chunkBlock); chunkWords = new ArrayList<>(); - } chunkWords.add(current); - prev = current; } } - Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline); - if (chunkBlock != null) { - chunkBlockList.add(chunkBlock); - chunkBlock.setHeadline(lastHeadline); - } + Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline, previousTable); + chunkBlock.setHeadline(lastHeadline); + chunkBlockList.add(chunkBlock); document.setParagraphs(chunkBlockList); } - private Paragraph buildTextBlock(List wordBlockList, String lastHeadline) { + private Paragraph buildTextBlock(List wordBlockList, String lastHeadline, Table previousTable) { Paragraph paragraph = new Paragraph(); TextBlock textBlock = null; @@ -76,19 +76,26 @@ public class SectionsBuilderService { AbstractTextContainer container = itty.next(); if (container instanceof Table) { + Table table = (Table) container; splitByTable = true; - if (previous != null && previous instanceof TextBlock && previous.getText().startsWith("Table ")) { - ((Table) container).setHeadline(previous.getText()); + if (previous != null && previous.getText().startsWith("Table ")) { + table.setHeadline(previous.getText()); } else { - ((Table) container).setHeadline("Table in: " + lastHeadline); + table.setHeadline("Table in: " + lastHeadline); + } + // Distribute header information for subsequent tables + if (previousTable != null && hasInvalidHeaderInformation(table) && hasValidHeaderInformation(previousTable) && + (previousTable.isVerticalHeader() && previousTable.getRowCount() == table.getRowCount() || + previousTable.getColCount() == table.getColCount())) { + table.setHeaders(previousTable.getHeaders()); } if (textBlock != null && !alreadyAdded) { paragraph.getPageBlocks().add(textBlock); alreadyAdded = true; } - paragraph.getPageBlocks().add(container); + paragraph.getPageBlocks().add(table); continue; } @@ -125,4 +132,24 @@ public class SectionsBuilderService { return paragraph; } -} + + private boolean hasValidHeaderInformation(Table table) { + + return !hasInvalidHeaderInformation(table); + } + + + private boolean hasInvalidHeaderInformation(Table table) { + + if (CollectionUtils.isEmpty(table.getHeaders())) { + return true; + } + if (table.getHeaders().stream().anyMatch(StringUtils::isEmpty)) { + return true; + } + + return false; + + } + +} \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java index 6a076019..9342533b 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java @@ -16,11 +16,17 @@ public class Cell extends Rectangle { private List textBlocks = new ArrayList<>(); + public Cell(Point2D topLeft, Point2D bottomRight) { - super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY())); + + super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight + .getY() - topLeft.getY())); } + public void addTextBlock(TextBlock textBlock) { + textBlocks.add(textBlock); } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/RectangleSpatialIndex.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/RectangleSpatialIndex.java index fc73610a..79f08ec4 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/RectangleSpatialIndex.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/RectangleSpatialIndex.java @@ -8,25 +8,28 @@ import org.locationtech.jts.index.strtree.STRtree; import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; - @SuppressWarnings("all") public class RectangleSpatialIndex { - private final STRtree si = new STRtree(); private final List rectangles = new ArrayList<>(); + public void add(T te) { + rectangles.add(te); si.insert(new Envelope(te.getLeft(), te.getRight(), te.getBottom(), te.getTop()), te); } - - public List contains(Rectangle r) { - List intersection = si.query(new Envelope(r.getLeft(), r.getRight(), r.getTop(), r.getBottom())); + + + public List contains(Rectangle rectangle) { + + List intersection = si.query(new Envelope(rectangle.getLeft(), rectangle.getRight(), rectangle.getTop(), rectangle + .getBottom())); List rv = new ArrayList(); - for (T ir: intersection) { - if (r.contains(ir)) { + for (T ir : intersection) { + if (rectangle.contains(ir)) { rv.add(ir); } } @@ -34,18 +37,22 @@ public class RectangleSpatialIndex { Utils.sort(rv, Rectangle.ILL_DEFINED_ORDER); return rv; } - + + public List intersects(Rectangle r) { + List rv = si.query(new Envelope(r.getLeft(), r.getRight(), r.getTop(), r.getBottom())); return rv; } - + + /** * Minimum bounding box of all the Rectangles contained on this RectangleSpatialIndex - * + * * @return a Rectangle */ public Rectangle getBounds() { + return Rectangle.boundingBoxOf(rectangles); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java index 20f1fbd8..1260bfd8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java @@ -8,32 +8,45 @@ import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.TreeMap; +import java.util.stream.Collectors; + +import org.apache.commons.collections4.CollectionUtils; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; import lombok.Getter; import lombok.Setter; +import lombok.extern.slf4j.Slf4j; -@SuppressWarnings("all") +@Slf4j public class Table extends AbstractTextContainer { private final TreeMap cells = new TreeMap<>(); - private RectangleSpatialIndex si = new RectangleSpatialIndex<>(); + private final RectangleSpatialIndex si = new RectangleSpatialIndex<>(); @Getter @Setter private String headline; @Getter - private int rowCount = 0; + private int rowCount; + @Getter - private int colCount = 0; + private int colCount; - private int rotation = 0; + private final int rotation; - private List> memoizedRows = null; + private List> rows; + + @Getter + @Setter + private List headers; + + @Getter + private boolean verticalHeader; public Table(List cells, Rectangle area, int rotation) { @@ -47,16 +60,87 @@ public class Table extends AbstractTextContainer { } + public List> getRows() { - if (memoizedRows == null) { - memoizedRows = computeRows(); + if (rows == null) { + rows = computeRows(); + headers = computeHeaders(); } - return memoizedRows; + return rows; } + + /** + * Detect header cells (either first row or first column): + * Column is marked as header if cell text is bold and row cell text is not bold. + * Defaults to row. + */ + private List computeHeaders() { + + boolean allBold = true; + List rowCells = rows.get(0); + for (Cell cell : rowCells) { + if (cell == null || CollectionUtils.isEmpty(cell.getTextBlocks()) || + !cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) { + allBold = false; + break; + } + } + if (!allBold) { + allBold = true; + List firstColCells = new ArrayList<>(); + for (List row : rows) { + Cell firstInRow = row.get(0); + if (firstInRow == null || CollectionUtils.isEmpty(firstInRow.getTextBlocks()) || + !firstInRow.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) { + allBold = false; + break; + } + firstColCells.add(firstInRow); + } + if (allBold) { + log.info("Headers are in first column"); + verticalHeader = true; + return firstColCells.stream().map(cell -> { + if (CollectionUtils.isNotEmpty(cell.getTextBlocks())) { + return TextNormalizationUtilities.removeHyphenLineBreaks(cell.getTextBlocks().get(0).getText()) + .replaceAll("\n", " ") + .replaceAll(" ", " "); + } else { + return null; + } + }).collect(Collectors.toList()); + } else { + log.info("Headers are defaulted in first row."); + return rowCells.stream().map(cell -> { + if (cell != null && CollectionUtils.isNotEmpty(cell.getTextBlocks())) { + return TextNormalizationUtilities.removeHyphenLineBreaks(cell.getTextBlocks().get(0).getText()) + .replaceAll("\n", " ") + .replaceAll(" ", " "); + } else { + return null; + } + }).collect(Collectors.toList()); + } + } else { + log.info("Headers are in first row."); + return rowCells.stream().map(cell -> { + if (CollectionUtils.isNotEmpty(cell.getTextBlocks())) { + return TextNormalizationUtilities.removeHyphenLineBreaks(cell.getTextBlocks().get(0).getText()) + .replaceAll("\n", " ") + .replaceAll(" ", " "); + } else { + return null; + } + }).collect(Collectors.toList()); + } + + } + + private List> computeRows() { List> rows = new ArrayList<>(); @@ -93,7 +177,8 @@ public class Table extends AbstractTextContainer { } - public void add(Cell chunk, int row, int col) { + + private void add(Cell chunk, int row, int col) { rowCount = Math.max(rowCount, row + 1); colCount = Math.max(colCount, col + 1); @@ -103,6 +188,7 @@ public class Table extends AbstractTextContainer { } + private void addCells(List cells) { if (cells.isEmpty()) { @@ -131,14 +217,9 @@ public class Table extends AbstractTextContainer { while (rowCells.hasNext()) { Cell cell = rowCells.next(); if (i > 0) { - List> others = rowsOfCells( - si.contains( - new Rectangle(cell.getBottom(), - si.getBounds().getLeft(), - cell.getLeft() - si.getBounds().getLeft() + 1, - si.getBounds().getBottom() - cell.getBottom() - ) - )); + List> others = rowsOfCells(si.contains(new Rectangle(cell.getBottom(), si.getBounds() + .getLeft(), cell.getLeft() - si.getBounds().getLeft() + 1, si.getBounds().getBottom() - cell + .getBottom()))); for (List r : others) { jumpToColumn = Math.max(jumpToColumn, r.size()); @@ -158,7 +239,9 @@ public class Table extends AbstractTextContainer { } } + private static List> rowsOfCells(List cells) { + Cell c; float lastTop; List> rv = new ArrayList<>(); @@ -168,19 +251,10 @@ public class Table extends AbstractTextContainer { return rv; } - Collections.sort(cells, new Comparator() { - @Override - public int compare(Cell arg0, Cell arg1) { - return Double.compare(arg0.getLeft(), arg1.getLeft()); - } - }); + cells.sort(Comparator.comparingDouble(Rectangle::getLeft)); - Collections.sort(cells, Collections.reverseOrder(new Comparator() { - @Override - public int compare(Cell arg0, Cell arg1) { - return Float.compare(Utils.round(arg0.getBottom(), 2), Utils.round(arg1.getBottom(),2)); - } - })); + cells.sort(Collections.reverseOrder((arg0, arg1) -> Float.compare(Utils.round(arg0.getBottom(), 2), Utils.round(arg1 + .getBottom(), 2)))); Iterator iter = cells.iterator(); c = iter.next(); @@ -201,6 +275,7 @@ public class Table extends AbstractTextContainer { return rv; } + @Override public String getText() { @@ -237,6 +312,7 @@ public class Table extends AbstractTextContainer { return sb.toString(); } + public String getTextAsHtml() { StringBuilder sb = new StringBuilder(); @@ -270,22 +346,30 @@ public class Table extends AbstractTextContainer { return sb.toString(); } - class CellPosition implements Comparable { + + static class CellPosition implements Comparable { CellPosition(int row, int col) { + this.row = row; this.col = col; } - final int row, col; + + final int row; + final int col; + @Override public int hashCode() { + return row + 101 * col; } + @Override public boolean equals(Object obj) { + if (this == obj) { return true; } @@ -299,10 +383,12 @@ public class Table extends AbstractTextContainer { return row == other.row && col == other.col; } + @Override public int compareTo(CellPosition other) { - int rowdiff = row - other.row; - return rowdiff != 0 ? rowdiff : col - other.col; + + int rowDiff = row - other.row; + return rowDiff != 0 ? rowDiff : col - other.col; } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index e5d41822..9e2771ae 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -99,7 +99,7 @@ public class RedactionIntegrationTest { @Before - public void stubRulesClient() { + public void stubClients() { when(rulesClient.getVersion()).thenReturn(0L); when(rulesClient.getRules()).thenReturn(new RulesResponse(RULES)); @@ -241,6 +241,27 @@ public class RedactionIntegrationTest { System.out.println("numberOfPages: " + result.getNumberOfPages()); } + @Test + public void testTableRedaction() throws IOException { + + long start = System.currentTimeMillis(); + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf"); + + RedactionRequest request = RedactionRequest.builder() + .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) + .build(); + + RedactionResult result = redactionController.redact(request); + + try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Redacted.pdf")) { + fileOutputStream.write(result.getDocument()); + } + long end = System.currentTimeMillis(); + + System.out.println("duration: " + (end - start)); + System.out.println("numberOfPages: " + result.getNumberOfPages()); + } + @Test public void classificationTest() throws IOException { diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java index 3856b882..9d2471f1 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java @@ -1,36 +1,91 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.when; +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Collections; import java.util.HashSet; import java.util.Set; +import org.apache.commons.io.IOUtils; +import org.apache.pdfbox.pdmodel.PDDocument; import org.junit.Test; import org.junit.runner.RunWith; +import org.kie.api.KieServices; +import org.kie.api.builder.KieBuilder; +import org.kie.api.builder.KieFileSystem; +import org.kie.api.builder.KieModule; import org.kie.api.runtime.KieContainer; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.boot.test.context.TestConfiguration; import org.springframework.boot.test.mock.mockito.MockBean; +import org.springframework.context.annotation.Bean; +import org.springframework.core.io.ClassPathResource; import org.springframework.test.context.junit4.SpringRunner; +import com.iqser.red.service.configuration.v1.api.model.DefaultColor; +import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse; +import com.iqser.red.service.configuration.v1.api.model.RulesResponse; +import com.iqser.red.service.configuration.v1.api.model.TypeResponse; +import com.iqser.red.service.configuration.v1.api.model.TypeResult; +import com.iqser.red.service.redaction.v1.model.RedactionRequest; +import com.iqser.red.service.redaction.v1.server.classification.model.Document; +import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; +import com.iqser.red.service.redaction.v1.server.client.RulesClient; import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; +import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; +import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService; @RunWith(SpringRunner.class) @SpringBootTest public class EntityRedactionServiceTest { - @MockBean - private KieContainer kieContainer; + private static final String DEFAULT_RULES = loadFromClassPath("drools/rules.drl"); + private static final String NAME_CODE = "name"; + private static final String ADDRESS_CODE = "address"; @MockBean - private DroolsExecutionService droolsExecutionService; + private DictionaryClient dictionaryClient; @MockBean - private DictionaryService dictionaryService; + private RulesClient rulesClient; @Autowired private EntityRedactionService entityRedactionService; + @Autowired + private PdfSegmentationService pdfSegmentationService; + + @TestConfiguration + public static class RedactionIntegrationTestConfiguration { + + @Bean + public KieContainer kieContainer() { + + KieServices kieServices = KieServices.Factory.get(); + + KieFileSystem kieFileSystem = kieServices.newKieFileSystem(); + InputStream input = new ByteArrayInputStream(DEFAULT_RULES.getBytes(StandardCharsets.UTF_8)); + kieFileSystem.write("src/test/resources/drools/rules.drl", kieServices.getResources() + .newInputStreamResource(input)); + KieBuilder kieBuilder = kieServices.newKieBuilder(kieFileSystem); + kieBuilder.buildAll(); + KieModule kieModule = kieBuilder.getKieModule(); + + return kieServices.newKieContainer(kieModule.getReleaseId()); + } + + } + @Test public void testNestedEntitiesRemoval() { @@ -47,4 +102,74 @@ public class EntityRedactionServiceTest { } + + @Test + public void testTableRedaction() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Single Table.pdf"); + + RedactionRequest redactionRequest = RedactionRequest.builder() + .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) + .build(); + + String tableRules = "package drools\n" + + "\n" + + "import com.iqser.red.service.redaction.v1.server.redaction.model.Section\n" + + "\n" + + "global Section section\n" + + "rule \"9: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study\"\n" + + " when\n" + + " Section(tabularData != null && tabularData.size() > 0\n" + + " && tabularData.containsKey(\"Vertebrate study Y/N\")\n" + + " && tabularData.get(\"Vertebrate study Y/N\").equals(\"Y\")\n" + + " )\n" + + " then\n" + + " section.redact(\"name\", 9, \"Redacted because row is a vertebrate study\");\n" + + " section.redact(\"address\", 9, \"Redacted because rows is a vertebrate study\");\n" + + " section.highlightCell(\"Vertebrate study Y/N\", 9);\n" + + " end"; + when(rulesClient.getVersion()).thenReturn(1L); + when(rulesClient.getRules()).thenReturn(new RulesResponse(tableRules)); + TypeResponse typeResponse = TypeResponse.builder() + .types(Arrays.asList( + TypeResult.builder().type(NAME_CODE).color(new float[]{1, 1, 0}).build(), + TypeResult.builder().type(ADDRESS_CODE).color(new float[]{0, 1, 1}).build())) + .build(); + when(dictionaryClient.getAllTypes()).thenReturn(typeResponse); + DictionaryResponse dictionaryResponse = DictionaryResponse.builder() + .entries(Arrays.asList("Casey, H.W.", "O’Loughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")) + .build(); + when(dictionaryClient.getDictionaryForType(NAME_CODE)).thenReturn(dictionaryResponse); + DictionaryResponse addressResponse = DictionaryResponse.builder() + .entries(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA")) + .build(); + when(dictionaryClient.getDictionaryForType(ADDRESS_CODE)).thenReturn(addressResponse); + when(dictionaryClient.getDefaultColor()).thenReturn(new DefaultColor()); + try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(redactionRequest.getDocument()))) { + Document classifiedDoc = pdfSegmentationService.parseDocument(pdDocument); + entityRedactionService.processDocument(classifiedDoc); + assertThat(classifiedDoc.getEntities()).hasSize(1); // one page + assertThat(classifiedDoc.getEntities().get(1)).hasSize(5); // 4 out of 5 entities recognized on page 1 + } + } + + + private static String loadFromClassPath(String path) { + + URL resource = ResourceLoader.class.getClassLoader().getResource(path); + if (resource == null) { + throw new IllegalArgumentException("could not load classpath resource: drools/rules.drl"); + } + try (BufferedReader br = new BufferedReader(new InputStreamReader(resource.openStream(), StandardCharsets.UTF_8))) { + StringBuilder sb = new StringBuilder(); + String str; + while ((str = br.readLine()) != null) { + sb.append(str).append("\n"); + } + return sb.toString(); + } catch (IOException e) { + throw new IllegalArgumentException("could not load classpath resource: " + path, e); + } + } + } \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index 20f24015..8d13e9f0 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -99,3 +99,14 @@ rule "8: Redact contact information, if Producer is found" section.redactBetween("No:", "Fax", "address", 8, "Producer was found"); end +rule "9: Redact Authors and Addresses in Reference Table, if it is a Vertebrate study" + when + Section(tabularData != null && tabularData.size() > 0 + && tabularData.containsKey("Vertebrate study Y/N") + && tabularData.get("Vertebrate study Y/N").equals("Y") + ) + then + section.redact("name", 9, "Redacted because row is a vertebrate study"); + section.redact("address", 9, "Redacted because rows is a vertebrate study"); + section.highlightCell("Vertebrate study Y/N", 9); + end \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Single Table.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Single Table.pdf new file mode 100644 index 00000000..9a11ca95 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/Single Table.pdf differ