RED-3974: Refactored processTablePerRow

This commit is contained in:
deiflaender 2022-08-11 14:18:36 +02:00
parent f84a366328
commit cba81ce061
3 changed files with 68 additions and 46 deletions

View File

@ -27,11 +27,15 @@ public class SectionText {
private boolean isTable;
private String headline;
@Builder.Default
private List<SectionArea> sectionAreas = new ArrayList<>();
@Builder.Default
private Set<Image> images = new HashSet<>();
@Builder.Default
private List<TextBlock> textBlocks = new ArrayList<>();
@Builder.Default
private Map<String, CellValue> tabularData = new HashMap<>();
@Builder.Default
private List<Integer> cellStarts = new ArrayList<>();

View File

@ -194,7 +194,7 @@ public class SearchableText {
}
public String buildString(List<TextPositionSequence> sequences) {
public static String buildString(List<TextPositionSequence> sequences) {
StringBuilder sb = new StringBuilder();

View File

@ -77,65 +77,48 @@ public class SectionTextBuilderService {
private List<SectionText> processTablePerRow(Table table, AtomicInteger sectionNumber) {
List<SectionText> sectionTexts = new ArrayList<>();
boolean hasHeader = hasTableHeader(table);
boolean hasHeader = table.getRows()
.stream()
.anyMatch(row -> row.stream()
.anyMatch(cell -> !cell.isHeaderCell() && !cell.getHeaderCells().isEmpty()));
for (List<Cell> row : table.getRows()) {
for (int rowNum = 0; rowNum < table.getRows().size(); rowNum++) {
SearchableText searchableRow = new SearchableText();
List<TextBlock> textBlocks = new ArrayList<>();
List<SectionArea> areas = new ArrayList<>();
Map<String, CellValue> tabularData = new HashMap<>();
int start = 0;
List<Integer> cellStarts = new ArrayList<>();
SectionText sectionText = new SectionText();
for (int cellNum = 0; cellNum < table.getRows().get(rowNum).size(); cellNum++) {
Cell cell = table.getRows().get(rowNum).get(cellNum);
List<Integer> startOffsets = new ArrayList<>();
int startOffset = 0;
for (int cellNum = 0; cellNum < row.size(); cellNum++) {
Cell cell = row.get(cellNum);
if (CollectionUtils.isEmpty(cell.getTextBlocks())) {
continue;
}
SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell.getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
.get(0)
.getSequences()
.get(0)
.getPage(), null);
sectionText.getSectionAreas().add(sectionArea);
sectionText.getTextBlocks().addAll(cell.getTextBlocks());
int cellStart = start;
SectionArea sectionArea = getSectionArea(cell);
areas.add(sectionArea);
if (!cell.isHeaderCell()) {
if (hasHeader) {
cell.getHeaderCells().forEach(headerCell -> {
String headerName = buildHeaderName(headerCell);
sectionArea.setHeader(headerName);
tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart));
});
} else {
String headerName = buildHeaderName(table.getRows().get(0).get(cellNum));
sectionArea.setHeader(headerName);
tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart));
}
String headerName = getHeaderName(hasHeader, cell, table.getRows().get(0).get(cellNum));
sectionArea.setHeader(headerName);
tabularData.put(headerName, new CellValue(cell.getTextBlocks(), startOffset));
}
for (TextBlock textBlock : cell.getTextBlocks()) {
// TODO avoid cell overlap merging.
searchableRow.addAll(textBlock.getSequences());
}
cellStarts.add(cellStart);
start = start + cell.toString().trim().length() + 1;
textBlocks.addAll(cell.getTextBlocks());
startOffsets.add(startOffset);
startOffset = startOffset + cell.toString().trim().length() + 1;
}
sectionText.setText(searchableRow.toString());
sectionText.setHeadline(table.getHeadline());
sectionText.setSectionNumber(sectionNumber.intValue());
sectionText.setTable(true);
sectionText.setTabularData(tabularData);
sectionText.setCellStarts(cellStarts);
sectionTexts.add(sectionText);
sectionTexts.add(SectionText.builder()
.text(getRowText(textBlocks))
.headline(table.getHeadline())
.sectionNumber(sectionNumber.intValue())
.isTable(true)
.tabularData(tabularData)
.cellStarts(startOffsets)
.textBlocks(textBlocks)
.sectionAreas(areas)
.build());
sectionNumber.incrementAndGet();
}
@ -144,6 +127,41 @@ public class SectionTextBuilderService {
}
/**
 * Builds the text of a table row from the text blocks of its cells.
 *
 * <p>The sequences of all given text blocks are concatenated in list order and handed to
 * {@code SearchableText.buildString}, which produces the resulting string.
 *
 * @param rowTextBlocks the text blocks of one row, in the order they should appear in the text
 * @return the string built by {@code SearchableText.buildString} from all sequences of the row
 */
public String getRowText(List<TextBlock> rowTextBlocks) {
    return SearchableText.buildString(rowTextBlocks.stream()
            .flatMap(block -> block.getSequences().stream())
            .collect(Collectors.toList()));
}
/**
 * Checks whether the table carries header information for at least one of its cells.
 *
 * @param table the table whose rows are inspected
 * @return {@code true} if any cell that is not itself a header cell has at least one
 *         header cell assigned, {@code false} otherwise (including for a table without rows)
 */
private boolean hasTableHeader(Table table) {
    for (List<Cell> row : table.getRows()) {
        for (Cell cell : row) {
            if (cell.isHeaderCell()) {
                // Header cells themselves do not count as evidence of a header mapping.
                continue;
            }
            if (!cell.getHeaderCells().isEmpty()) {
                return true;
            }
        }
    }
    return false;
}
/**
 * Creates the {@code SectionArea} describing the given cell.
 *
 * <p>The area is positioned at the cell's x/y coordinates with the cell's width and height
 * (all narrowed to {@code float}). The page is taken from the first sequence of the cell's
 * first text block; the last constructor argument is passed as {@code null}.
 *
 * <p>The cell must have at least one text block, and that block at least one sequence,
 * otherwise the {@code get(0)} lookups fail.
 *
 * @param cell the cell to describe; must contain at least one text block with one sequence
 * @return a new section area for the cell
 */
private SectionArea getSectionArea(Cell cell) {
    Point origin = new Point((float) cell.getX(), (float) cell.getY());
    float width = (float) cell.getWidth();
    float height = (float) cell.getHeight();
    TextBlock firstBlock = cell.getTextBlocks().get(0);
    return new SectionArea(origin, width, height, firstBlock.getSequences().get(0).getPage(), null);
}
/**
 * Resolves the header name to use for a table cell.
 *
 * <p>If the table has header information and the current cell has header cells assigned,
 * the name is built from the last of those header cells. Otherwise the name is built from
 * the cell at the same position in the first row.
 *
 * @param hasHeader      whether the table as a whole carries header information; this is a
 *                       table-wide flag and does not guarantee that {@code currentCell}
 *                       itself has header cells
 * @param currentCell    the cell whose header name is wanted
 * @param cellInFirstRow the first-row cell used as the fallback header
 * @return the header name produced by {@code buildHeaderName}
 */
private String getHeaderName(boolean hasHeader, Cell currentCell, Cell cellInFirstRow) {
    // A table can have headers while an individual cell has none assigned; indexing
    // size() - 1 on the empty list would throw IndexOutOfBoundsException, so fall back
    // to the first-row cell in that case as well.
    if (!hasHeader || currentCell.getHeaderCells().isEmpty()) {
        return buildHeaderName(cellInFirstRow);
    }
    return buildHeaderName(currentCell.getHeaderCells()
            .get(currentCell.getHeaderCells().size() - 1));
}
private String buildHeaderName(Cell cell) {
StringBuilder headerBuilder = new StringBuilder();