diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java index cf6ced53..7077e98c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java @@ -27,11 +27,15 @@ public class SectionText { private boolean isTable; private String headline; + @Builder.Default private List sectionAreas = new ArrayList<>(); + @Builder.Default private Set images = new HashSet<>(); - + @Builder.Default private List textBlocks = new ArrayList<>(); + @Builder.Default private Map tabularData = new HashMap<>(); + @Builder.Default private List cellStarts = new ArrayList<>(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java index f7f63e07..4503f5d6 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java @@ -194,7 +194,7 @@ public class SearchableText { } - public String buildString(List sequences) { + public static String buildString(List sequences) { StringBuilder sb = new StringBuilder(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java index 1d83a4b5..7034753e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java @@ -77,65 +77,48 @@ public class SectionTextBuilderService { private List processTablePerRow(Table table, AtomicInteger sectionNumber) { List sectionTexts = new ArrayList<>(); + boolean hasHeader = hasTableHeader(table); - boolean hasHeader = table.getRows() - .stream() - .anyMatch(row -> row.stream() - .anyMatch(cell -> !cell.isHeaderCell() && !cell.getHeaderCells().isEmpty())); + for (List row : table.getRows()) { - for (int rowNum = 0; rowNum < table.getRows().size(); rowNum++) { - SearchableText searchableRow = new SearchableText(); + List textBlocks = new ArrayList<>(); + List areas = new ArrayList<>(); Map tabularData = new HashMap<>(); - int start = 0; - List cellStarts = new ArrayList<>(); - SectionText sectionText = new SectionText(); - for (int cellNum = 0; cellNum < table.getRows().get(rowNum).size(); cellNum++) { - Cell cell = table.getRows().get(rowNum).get(cellNum); + List startOffsets = new ArrayList<>(); + int startOffset = 0; + for (int cellNum = 0; cellNum < row.size(); cellNum++) { + + Cell cell = row.get(cellNum); if (CollectionUtils.isEmpty(cell.getTextBlocks())) { continue; } - SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell.getWidth(), (float) cell.getHeight(), cell.getTextBlocks() - .get(0) - .getSequences() - .get(0) - .getPage(), null); - sectionText.getSectionAreas().add(sectionArea); - sectionText.getTextBlocks().addAll(cell.getTextBlocks()); - - int cellStart = start; + SectionArea sectionArea = getSectionArea(cell); + areas.add(sectionArea); if (!cell.isHeaderCell()) { - if (hasHeader) { - cell.getHeaderCells().forEach(headerCell -> { - String headerName = buildHeaderName(headerCell); - sectionArea.setHeader(headerName); - tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart)); - }); - } else { - String headerName = buildHeaderName(table.getRows().get(0).get(cellNum)); - sectionArea.setHeader(headerName); - tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart)); - } + String headerName = getHeaderName(hasHeader, cell, table.getRows().get(0).get(cellNum)); + sectionArea.setHeader(headerName); + tabularData.put(headerName, new CellValue(cell.getTextBlocks(), startOffset)); } - for (TextBlock textBlock : cell.getTextBlocks()) { - // TODO avoid cell overlap merging. - searchableRow.addAll(textBlock.getSequences()); - } - cellStarts.add(cellStart); - start = start + cell.toString().trim().length() + 1; + textBlocks.addAll(cell.getTextBlocks()); + startOffsets.add(startOffset); + startOffset = startOffset + cell.toString().trim().length() + 1; } - sectionText.setText(searchableRow.toString()); - sectionText.setHeadline(table.getHeadline()); - sectionText.setSectionNumber(sectionNumber.intValue()); - sectionText.setTable(true); - sectionText.setTabularData(tabularData); - sectionText.setCellStarts(cellStarts); - sectionTexts.add(sectionText); + sectionTexts.add(SectionText.builder() + .text(getRowText(textBlocks)) + .headline(table.getHeadline()) + .sectionNumber(sectionNumber.intValue()) + .isTable(true) + .tabularData(tabularData) + .cellStarts(startOffsets) + .textBlocks(textBlocks) + .sectionAreas(areas) + .build()); sectionNumber.incrementAndGet(); } @@ -144,6 +127,41 @@ public class SectionTextBuilderService { } + public String getRowText(List rowTextBlocks) { + + return SearchableText.buildString(rowTextBlocks.stream() + .map(textBlock -> textBlock.getSequences()) + .flatMap(List::stream) + .collect(Collectors.toList())); + } + + + private boolean hasTableHeader(Table table) { + + return table.getRows() + .stream() + .anyMatch(row -> row.stream() + .anyMatch(cell -> !cell.isHeaderCell() && !cell.getHeaderCells().isEmpty())); + } + + + private SectionArea getSectionArea(Cell cell) { + + return new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell.getWidth(), (float) cell.getHeight(), cell.getTextBlocks() + .get(0) + .getSequences() + .get(0) + .getPage(), null); + } + + + private String getHeaderName(boolean hasHeader, Cell currentCell, Cell cellInFirstRow) { + + return hasHeader ? buildHeaderName(currentCell.getHeaderCells() + .get(currentCell.getHeaderCells().size() - 1)) : buildHeaderName(cellInFirstRow); + } + + private String buildHeaderName(Cell cell) { StringBuilder headerBuilder = new StringBuilder();