Let Tables know their headlines

This commit is contained in:
deiflaender 2020-07-27 15:33:25 +02:00
parent 135a715e22
commit 88e1c5c58e
3 changed files with 44 additions and 35 deletions

View File

@ -40,7 +40,6 @@ public class EntityRedactionService {
List<Table> tables = paragraph.getTables();
List<SearchableText> searchableRows = new ArrayList<>();
for (Table table : tables) {
for (List<Cell> row : table.getRows()) {
SearchableText searchableRow = new SearchableText();
@ -52,7 +51,23 @@ public class EntityRedactionService {
searchableRow.addAll(textBlock.getSequences());
}
}
searchableRows.add(searchableRow);
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline());
Section analysedRowSection = droolsExecutionService.executeRules(Section.builder()
.entities(rowEntities)
.text(searchableRow.getAsStringWithLinebreaks())
.searchText(searchableRow.toString())
.headline(table.getHeadline())
.build());
for (Entity entity : analysedRowSection.getEntities()) {
if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), true));
} else {
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), false));
}
}
documentEntities.addAll(analysedRowSection.getEntities());
}
}
@ -73,26 +88,6 @@ public class EntityRedactionService {
}
documentEntities.addAll(analysedSection.getEntities());
for (SearchableText searchableRow : searchableRows) {
Set<Entity> rowEntities = findEntities(searchableRow, "//TODO TableHeader");
Section analysedRowSection = droolsExecutionService.executeRules(Section.builder()
.entities(rowEntities)
.text(searchableRow.getAsStringWithLinebreaks())
.searchText(searchableRow.toString())
.headline("//TODO TableHeader")
.build());
for (Entity entity : analysedRowSection.getEntities()) {
if (dictionaryService.getCaseInsensitiveTypes().contains(entity.getType())) {
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), true));
} else {
entity.setPositionSequences(searchableRow.getSequences(entity.getWord(), false));
}
}
documentEntities.addAll(analysedRowSection.getEntities());
}
}
documentEntities.forEach(entity -> {

View File

@ -17,7 +17,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
@SuppressWarnings("all")
public class SectionsBuilderService {
public void buildSections(Document document) {
List<AbstractTextContainer> chunkWords = new ArrayList<>();
@ -29,7 +28,8 @@ public class SectionsBuilderService {
for (Page page : document.getPages()) {
for (AbstractTextContainer current : page.getTextBlocks()) {
if (current.getClassification() == null || current.getClassification().equals("Header") || current.getClassification().equals("Footer")) {
if (current.getClassification() == null || current.getClassification()
.equals("Header") || current.getClassification().equals("Footer")) {
continue;
}
@ -37,7 +37,7 @@ public class SectionsBuilderService {
if (prev != null && current.getClassification().startsWith("H ") || !document.isHeadlines()) {
Paragraph chunkBlock = buildTextBlock(chunkWords);
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
chunkBlock.setHeadline(lastHeadline);
lastHeadline = current.getText();
chunkBlockList.add(chunkBlock);
@ -51,7 +51,7 @@ public class SectionsBuilderService {
}
}
Paragraph chunkBlock = buildTextBlock(chunkWords);
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
if (chunkBlock != null) {
chunkBlockList.add(chunkBlock);
chunkBlock.setHeadline(lastHeadline);
@ -61,7 +61,7 @@ public class SectionsBuilderService {
}
private Paragraph buildTextBlock(List<AbstractTextContainer> wordBlockList) {
private Paragraph buildTextBlock(List<AbstractTextContainer> wordBlockList, String lastHeadline) {
Paragraph paragraph = new Paragraph();
TextBlock textBlock = null;
@ -70,17 +70,23 @@ public class SectionsBuilderService {
boolean splitByTable = false;
Iterator<AbstractTextContainer> itty = wordBlockList.iterator();
boolean alreadyAdded= false;
boolean alreadyAdded = false;
AbstractTextContainer previous = null;
while (itty.hasNext()) {
AbstractTextContainer container = itty.next();
if (container instanceof Table) {
splitByTable = true;
if (previous != null && previous instanceof TextBlock && previous.getText().startsWith("Table ")) {
((Table) container).setHeadline(previous.getText());
} else {
((Table) container).setHeadline("Table in: " + lastHeadline);
}
if (textBlock != null && !alreadyAdded) {
paragraph.getPageBlocks().add(textBlock);
alreadyAdded =true;
alreadyAdded = true;
}
paragraph.getPageBlocks().add(container);
continue;
@ -89,24 +95,28 @@ public class SectionsBuilderService {
TextBlock wordBlock = (TextBlock) container;
if (textBlock == null) {
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock
.getSequences(), wordBlock.getRotation());
textBlock.setPage(wordBlock.getPage());
} else if (splitByTable) {
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock
.getSequences(), wordBlock.getRotation());
textBlock.setPage(wordBlock.getPage());
alreadyAdded = false;
} else if (pageBefore != -1 && wordBlock.getPage() != pageBefore) {
textBlock.setPage(pageBefore);
paragraph.getPageBlocks().add(textBlock);
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock
.getSequences(), wordBlock.getRotation());
textBlock.setPage(wordBlock.getPage());
} else {
TextBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(),
spatialEntity.getWidth(), spatialEntity.getHeight());
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity
.getHeight());
}
pageBefore = wordBlock.getPage();
splitByTable = false;
previous = container;
}
if (textBlock != null && !alreadyAdded) {
@ -115,5 +125,4 @@ public class SectionsBuilderService {
return paragraph;
}
}

View File

@ -13,6 +13,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import lombok.Getter;
import lombok.Setter;
@SuppressWarnings("all")
public class Table extends AbstractTextContainer {
@ -21,6 +22,10 @@ public class Table extends AbstractTextContainer {
private RectangleSpatialIndex<Cell> si = new RectangleSpatialIndex<>();
@Getter
@Setter
private String headline;
@Getter
private int rowCount = 0;
@Getter