TAAS-41: TAAS Document Structure
* added linebreaks to ParagraphData * changed RowData.cellText from List<String> to List<ParagraphData>
This commit is contained in:
parent
7f0aa32d1b
commit
788613c92e
@ -12,6 +12,7 @@ public class ParagraphData {
|
||||
private String text;
|
||||
List<Range> boldTextBoundaries;
|
||||
List<Range> italicTextBoundaries;
|
||||
List<Integer> linebreaks;
|
||||
private String classification;
|
||||
|
||||
private String orientation;
|
||||
|
||||
@ -10,6 +10,6 @@ import lombok.Data;
|
||||
public class RowData {
|
||||
|
||||
boolean header;
|
||||
List<String> cellText;
|
||||
List<ParagraphData> cellText;
|
||||
float[] bBox;
|
||||
}
|
||||
|
||||
@ -13,6 +13,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Researc
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||
@ -23,6 +24,7 @@ import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.Text
|
||||
public class TaasDocumentDataMapper {
|
||||
|
||||
public static ResearchDocumentData fromDocument(Document document) {
|
||||
|
||||
AtomicInteger structureObjectNumber = new AtomicInteger();
|
||||
List<StructureObject> structureObjects = document.streamAllSubNodes()
|
||||
.filter(node -> !node.getType().equals(NodeType.TABLE_CELL))
|
||||
@ -38,24 +40,32 @@ public class TaasDocumentDataMapper {
|
||||
return ResearchDocumentData.builder().structureObjects(structureObjects).build();
|
||||
}
|
||||
|
||||
|
||||
public static ParagraphData fromTextBlock(String classification, TextBlock textBlock) {
|
||||
|
||||
return ParagraphData.builder()
|
||||
.boldTextBoundaries(textBlock.getBoldTextBoundaries().stream().map(b -> new Range(b.start(), b.end())).toList())
|
||||
.italicTextBoundaries(textBlock.getItalicTextBoundaries().stream().map(b -> new Range(b.start(), b.end())).toList())
|
||||
.text(textBlock.getSearchText())
|
||||
.linebreaks(textBlock.getLineBreaks())
|
||||
.classification(classification)
|
||||
.orientation(textBlock.getOrientation())
|
||||
.textDirection(textBlock.getTextDirection())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
public static TableData fromTable(Table table) {
|
||||
|
||||
List<RowData> rowData = IntStream.range(0, table.getNumberOfRows()).boxed().map(rowIdx -> table.streamRow(rowIdx).toList()).map(TaasDocumentDataMapper::fromTableCells).toList();
|
||||
List<RowData> rowData = IntStream.range(0, table.getNumberOfRows())
|
||||
.boxed()
|
||||
.map(rowIdx -> table.streamRow(rowIdx).toList())
|
||||
.map(TaasDocumentDataMapper::fromTableCells)
|
||||
.toList();
|
||||
return new TableData(rowData, table.getNumberOfCols(), table.getNumberOfRows());
|
||||
}
|
||||
|
||||
|
||||
public static RowData fromTableCells(List<TableCell> tableCells) {
|
||||
|
||||
if (tableCells.isEmpty()) {
|
||||
@ -67,8 +77,20 @@ public class TaasDocumentDataMapper {
|
||||
map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D()));
|
||||
return map2;
|
||||
}).orElseThrow().get(firstPage);
|
||||
List<String> cellText = tableCells.stream().map(TableCell::getTextBlock).map(TextBlock::getSearchText).toList();
|
||||
return new RowData(header, cellText, toFloatArray(bBox));
|
||||
List<TextBlock> textBlocks = tableCells.stream().map(TableCell::getTextBlock).toList();
|
||||
return new RowData(header, textBlocks.stream().map(textBlock -> TaasDocumentDataMapper.fromTextBlock("table_cell", textBlock)).toList(), toFloatArray(bBox));
|
||||
}
|
||||
|
||||
|
||||
private static Range toRange(Boundary boundary) {
|
||||
|
||||
return new Range(boundary.start(), boundary.end());
|
||||
}
|
||||
|
||||
|
||||
private static List<Range> toRange(List<Boundary> boundary) {
|
||||
|
||||
return boundary.stream().map(TaasDocumentDataMapper::toRange).toList();
|
||||
}
|
||||
|
||||
|
||||
@ -101,8 +123,10 @@ public class TaasDocumentDataMapper {
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private static float[] toFloatArray(Rectangle2D bBox) {
|
||||
|
||||
return new float[]{(float) bBox.getX(), (float) bBox.getY(), (float) bBox.getWidth(), (float) bBox.getHeight()};
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
|
||||
/**
 * Placeholder test class; it does not contain any test cases yet.
 */
public class DocumentDataTests {
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user