TAAS-41: TAAS Document Structure

* added linebreaks to ParagraphData
* changed RowData.cellText from List<String> to List<ParagraphData>
This commit is contained in:
Kilian Schuettler 2023-06-22 16:32:08 +02:00 committed by Timo Bejan
parent 7f0aa32d1b
commit 788613c92e
7 changed files with 34 additions and 4 deletions

View File

@ -12,6 +12,7 @@ public class ParagraphData {
private String text;
List<Range> boldTextBoundaries;
List<Range> italicTextBoundaries;
List<Integer> linebreaks;
private String classification;
private String orientation;

View File

@ -10,6 +10,6 @@ import lombok.Data;
public class RowData {

    // Data holder for a single table row: a header flag, the per-cell paragraph data and a bounding box.

    // Header flag of the row, passed in by the code that constructs the row.
    boolean header;

    // Paragraph data of the row's cells, one entry per cell.
    // Declared exactly once: a field name may only be declared once per class.
    List<ParagraphData> cellText;

    // Bounding box values of the row.
    // NOTE(review): the mapper appears to fill this as {x, y, width, height} - confirm against the producer.
    float[] bBox;
}

View File

@ -13,6 +13,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Researc
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject;
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData;
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
@ -23,6 +24,7 @@ import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.Text
public class TaasDocumentDataMapper {
public static ResearchDocumentData fromDocument(Document document) {
AtomicInteger structureObjectNumber = new AtomicInteger();
List<StructureObject> structureObjects = document.streamAllSubNodes()
.filter(node -> !node.getType().equals(NodeType.TABLE_CELL))
@ -38,24 +40,32 @@ public class TaasDocumentDataMapper {
return ResearchDocumentData.builder().structureObjects(structureObjects).build();
}
/**
 * Builds the {@link ParagraphData} describing a single text block.
 *
 * @param classification value stored as the classification of the resulting paragraph
 * @param textBlock      text block supplying the search text, bold/italic boundaries, line breaks,
 *                       orientation and text direction
 * @return the paragraph data assembled from the given text block
 */
public static ParagraphData fromTextBlock(String classification, TextBlock textBlock) {

    // Convert the formatting boundaries up front so the builder chain below stays flat.
    List<Range> boldRanges = textBlock.getBoldTextBoundaries()
            .stream()
            .map(bold -> new Range(bold.start(), bold.end()))
            .toList();
    List<Range> italicRanges = textBlock.getItalicTextBoundaries()
            .stream()
            .map(italic -> new Range(italic.start(), italic.end()))
            .toList();

    return ParagraphData.builder()
            .boldTextBoundaries(boldRanges)
            .italicTextBoundaries(italicRanges)
            .text(textBlock.getSearchText())
            .linebreaks(textBlock.getLineBreaks())
            .classification(classification)
            .orientation(textBlock.getOrientation())
            .textDirection(textBlock.getTextDirection())
            .build();
}
/**
 * Converts a {@link Table} into its {@link TableData} representation.
 * <p>
 * Every row index from 0 (inclusive) to {@code table.getNumberOfRows()} (exclusive) is turned into one
 * {@link RowData} via {@link #fromTableCells(List)}.
 *
 * @param table the table to convert
 * @return table data holding the converted rows together with the table's column and row counts
 */
public static TableData fromTable(Table table) {

    // rowData is declared exactly once; mapToObj maps the primitive row index directly,
    // so no intermediate Stream<Integer> is created.
    List<RowData> rowData = IntStream.range(0, table.getNumberOfRows())
            .mapToObj(rowIdx -> table.streamRow(rowIdx).toList())
            .map(TaasDocumentDataMapper::fromTableCells)
            .toList();

    return new TableData(rowData, table.getNumberOfCols(), table.getNumberOfRows());
}
public static RowData fromTableCells(List<TableCell> tableCells) {
if (tableCells.isEmpty()) {
@ -67,8 +77,20 @@ public class TaasDocumentDataMapper {
map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D()));
return map2;
}).orElseThrow().get(firstPage);
List<String> cellText = tableCells.stream().map(TableCell::getTextBlock).map(TextBlock::getSearchText).toList();
return new RowData(header, cellText, toFloatArray(bBox));
List<TextBlock> textBlocks = tableCells.stream().map(TableCell::getTextBlock).toList();
return new RowData(header, textBlocks.stream().map(textBlock -> TaasDocumentDataMapper.fromTextBlock("table_cell", textBlock)).toList(), toFloatArray(bBox));
}
/**
 * Converts a single {@link Boundary} into a {@link Range} with the same start and end.
 *
 * @param boundary the boundary to convert
 * @return a new range built from {@code boundary.start()} and {@code boundary.end()}
 */
private static Range toRange(Boundary boundary) {

    var start = boundary.start();
    var end = boundary.end();
    return new Range(start, end);
}
/**
 * Converts a list of {@link Boundary} objects into a list of {@link Range} objects, keeping the order.
 *
 * @param boundary the boundaries to convert
 * @return an unmodifiable list with one range per input boundary
 */
private static List<Range> toRange(List<Boundary> boundary) {

    return boundary.stream()
            .map(single -> new Range(single.start(), single.end()))
            .toList();
}
@ -101,8 +123,10 @@ public class TaasDocumentDataMapper {
.build();
}
/**
 * Flattens a rectangle into a four-element float array.
 *
 * @param bBox the rectangle to flatten
 * @return {@code {x, y, width, height}} of the rectangle, each narrowed from double to float
 */
private static float[] toFloatArray(Rectangle2D bBox) {

    float x = (float) bBox.getX();
    float y = (float) bBox.getY();
    float width = (float) bBox.getWidth();
    float height = (float) bBox.getHeight();
    return new float[]{x, y, width, height};
}
}

View File

@ -0,0 +1,5 @@
package com.knecon.fforesight.service.layoutparser.server.graph;
/**
 * Empty class: it currently declares no fields, methods or test cases.
 * NOTE(review): presumably a placeholder for tests of the document data mapping - confirm.
 */
public class DocumentDataTests {
}