diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index 73c15013..ee1e5771 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -40,7 +40,6 @@ public class EntityRedactionService { List tables = paragraph.getTables(); - List searchableRows = new ArrayList<>(); for (Table table : tables) { for (List row : table.getRows()) { SearchableText searchableRow = new SearchableText(); @@ -52,7 +51,23 @@ public class EntityRedactionService { searchableRow.addAll(textBlock.getSequences()); } } - searchableRows.add(searchableRow); + Set rowEntities = findEntities(searchableRow, table.getHeadline()); + + Section analysedRowSection = droolsExecutionService.executeRules(Section.builder() + .entities(rowEntities) + .text(searchableRow.getAsStringWithLinebreaks()) + .searchText(searchableRow.toString()) + .headline(table.getHeadline()) + .build()); + + for (Entity entity : analysedRowSection.getEntities()) { + if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) { + entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), true)); + } else { + entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), false)); + } + } + documentEntities.addAll(analysedRowSection.getEntities()); } } @@ -73,26 +88,6 @@ public class EntityRedactionService { } documentEntities.addAll(analysedSection.getEntities()); - - for (SearchableText searchableRow : searchableRows) { - Set rowEntities = findEntities(searchableRow, "//TODO TableHeader"); - - Section analysedRowSection = droolsExecutionService.executeRules(Section.builder() - .entities(rowEntities) - .text(searchableRow.getAsStringWithLinebreaks()) - .searchText(searchableRow.toString()) - .headline("//TODO TableHeader") - .build()); - - for (Entity entity : analysedRowSection.getEntities()) { - if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) { - entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), true)); - } else { - entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), false)); - } - } - documentEntities.addAll(analysedRowSection.getEntities()); - } } documentEntities.forEach(entity -> { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java index 8c64ce56..04a45e91 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java @@ -17,7 +17,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; @SuppressWarnings("all") public class SectionsBuilderService { - public void buildSections(Document document) { List chunkWords = new ArrayList<>(); @@ -29,7 +28,8 @@ public class SectionsBuilderService { for (Page page : document.getPages()) { for (AbstractTextContainer current : page.getTextBlocks()) { - if (current.getClassification() == null || current.getClassification().equals("Header") || current.getClassification().equals("Footer")) { + if (current.getClassification() == null || current.getClassification() + .equals("Header") || current.getClassification().equals("Footer")) { continue; } @@ -37,7 +37,7 @@ public class SectionsBuilderService { if (prev != null && current.getClassification().startsWith("H ") || !document.isHeadlines()) { - Paragraph chunkBlock = buildTextBlock(chunkWords); + Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline); chunkBlock.setHeadline(lastHeadline); lastHeadline = current.getText(); chunkBlockList.add(chunkBlock); @@ -51,7 +51,7 @@ public class SectionsBuilderService { } } - Paragraph chunkBlock = buildTextBlock(chunkWords); + Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline); if (chunkBlock != null) { chunkBlockList.add(chunkBlock); chunkBlock.setHeadline(lastHeadline); @@ -61,7 +61,7 @@ public class SectionsBuilderService { } - private Paragraph buildTextBlock(List wordBlockList) { + private Paragraph buildTextBlock(List wordBlockList, String lastHeadline) { Paragraph paragraph = new Paragraph(); TextBlock textBlock = null; @@ -70,17 +70,23 @@ public class SectionsBuilderService { boolean splitByTable = false; Iterator itty = wordBlockList.iterator(); - boolean alreadyAdded= false; + boolean alreadyAdded = false; + AbstractTextContainer previous = null; while (itty.hasNext()) { AbstractTextContainer container = itty.next(); if (container instanceof Table) { splitByTable = true; + if (previous != null && previous instanceof TextBlock && previous.getText().startsWith("Table ")) { + ((Table) container).setHeadline(previous.getText()); + } else { + ((Table) container).setHeadline("Table in: " + lastHeadline); + } if (textBlock != null && !alreadyAdded) { paragraph.getPageBlocks().add(textBlock); - alreadyAdded =true; + alreadyAdded = true; } paragraph.getPageBlocks().add(container); continue; @@ -89,24 +95,28 @@ public class SectionsBuilderService { TextBlock wordBlock = (TextBlock) container; if (textBlock == null) { - textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation()); + textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock + .getSequences(), wordBlock.getRotation()); textBlock.setPage(wordBlock.getPage()); } else if (splitByTable) { - textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation()); + textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock + .getSequences(), wordBlock.getRotation()); textBlock.setPage(wordBlock.getPage()); alreadyAdded = false; } else if (pageBefore != -1 && wordBlock.getPage() != pageBefore) { textBlock.setPage(pageBefore); paragraph.getPageBlocks().add(textBlock); - textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation()); + textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock + .getSequences(), wordBlock.getRotation()); textBlock.setPage(wordBlock.getPage()); } else { TextBlock spatialEntity = textBlock.union(wordBlock); - textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), - spatialEntity.getWidth(), spatialEntity.getHeight()); + textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity + .getHeight()); } pageBefore = wordBlock.getPage(); splitByTable = false; + previous = container; } if (textBlock != null && !alreadyAdded) { @@ -115,5 +125,4 @@ public class SectionsBuilderService { return paragraph; } - } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java index f45c2d7c..20f1fbd8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java @@ -13,6 +13,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; import lombok.Getter; +import lombok.Setter; @SuppressWarnings("all") public class Table extends AbstractTextContainer { @@ -21,6 +22,10 @@ public class Table extends AbstractTextContainer { private RectangleSpatialIndex si = new RectangleSpatialIndex<>(); + @Getter + @Setter + private String headline; + @Getter private int rowCount = 0; @Getter