Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7bb2293915 | ||
|
|
3a57d26e97 | ||
|
|
e3819349cf | ||
|
|
e68869495a | ||
|
|
4fb0de82ec |
@ -0,0 +1,11 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.clarifynd;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class IndexData {
|
||||||
|
|
||||||
|
Map<String, String> identifier;
|
||||||
|
List<TextChunk> textChunks;
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,18 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.clarifynd;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Range;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class TextChunk {
|
||||||
|
|
||||||
|
String text;
|
||||||
|
}
|
||||||
@ -10,36 +10,25 @@ import lombok.NonNull;
|
|||||||
@Builder
|
@Builder
|
||||||
@Schema(description = "Object containing all storage paths the service needs to know.")
|
@Schema(description = "Object containing all storage paths the service needs to know.")
|
||||||
public record LayoutParsingRequest(
|
public record LayoutParsingRequest(
|
||||||
@Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}")//
|
|
||||||
@NonNull LayoutParsingType layoutParsingType,
|
@NonNull LayoutParsingType layoutParsingType,
|
||||||
|
|
||||||
@Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.")//
|
|
||||||
Map<String, String> identifier,
|
Map<String, String> identifier,
|
||||||
|
|
||||||
@Schema(description = "Path to the original PDF file.")//
|
@NonNull String originFileStorageId,
|
||||||
@NonNull String originFileStorageId,//
|
|
||||||
|
|
||||||
|
Optional<String> tablesFileStorageId,
|
||||||
|
Optional<String> imagesFileStorageId,
|
||||||
|
|
||||||
@Schema(description = "Optional Path to the table extraction file.")//
|
Optional<String> visualLayoutParsingFileId,
|
||||||
Optional<String> tablesFileStorageId,//
|
|
||||||
@Schema(description = "Optional Path to the image classification file.")//
|
|
||||||
Optional<String> imagesFileStorageId,//
|
|
||||||
|
|
||||||
@Schema(description = "Optional Path to the the visual layout parsing service file") Optional<String> visualLayoutParsingFileId,//
|
@NonNull String structureFileStorageId,
|
||||||
|
String researchDocumentStorageId,
|
||||||
@Schema(description = "Path where the Document Structure File will be stored.")//
|
String markdownDocumentStorageId,
|
||||||
@NonNull String structureFileStorageId,//
|
@NonNull String textBlockFileStorageId,
|
||||||
@Schema(description = "Path where the Research Data File will be stored.")//
|
@NonNull String positionBlockFileStorageId,
|
||||||
String researchDocumentStorageId,//
|
@NonNull String pageFileStorageId,
|
||||||
@Schema(description = "Path where the Document Text File will be stored.")//
|
@NonNull String simplifiedTextStorageId,
|
||||||
@NonNull String textBlockFileStorageId,//
|
@NonNull String viewerDocumentStorageId
|
||||||
@Schema(description = "Path where the Document Positions File will be stored.")//
|
) {
|
||||||
@NonNull String positionBlockFileStorageId,//
|
|
||||||
@Schema(description = "Path where the Document Pages File will be stored.")//
|
|
||||||
@NonNull String pageFileStorageId,//
|
|
||||||
@Schema(description = "Path where the Simplified Text File will be stored.")//
|
|
||||||
@NonNull String simplifiedTextStorageId,//
|
|
||||||
@Schema(description = "Path where the Viewer Document PDF will be stored.")//
|
|
||||||
@NonNull String viewerDocumentStorageId) {
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -8,5 +8,6 @@ public enum LayoutParsingType {
|
|||||||
DOCUMINE,
|
DOCUMINE,
|
||||||
DOCUMINE_OLD,
|
DOCUMINE_OLD,
|
||||||
CLARIFYND,
|
CLARIFYND,
|
||||||
CLARIFYND_PARAGRAPH_DEBUG
|
CLARIFYND_PARAGRAPH_DEBUG,
|
||||||
|
MARKDOWN
|
||||||
}
|
}
|
||||||
|
|||||||
@ -26,4 +26,10 @@ dependencies {
|
|||||||
implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
|
implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
|
||||||
implementation("org.jgrapht:jgrapht-core:1.5.2")
|
implementation("org.jgrapht:jgrapht-core:1.5.2")
|
||||||
implementation("org.tinspin:tinspin-indexes:2.1.3")
|
implementation("org.tinspin:tinspin-indexes:2.1.3")
|
||||||
|
implementation("org.commonmark:commonmark:0.22.0")
|
||||||
|
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
|
||||||
|
implementation("com.didalgo:gpt3-tokenizer:0.1.8")
|
||||||
|
|
||||||
|
implementation("org.mapstruct:mapstruct:1.5.5.Final")
|
||||||
|
annotationProcessor("org.mapstruct:mapstruct-processor:1.5.5.Final")
|
||||||
}
|
}
|
||||||
|
|||||||
@ -4,6 +4,7 @@ import static java.lang.String.format;
|
|||||||
|
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
@ -18,12 +19,15 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
|||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||||
|
import org.commonmark.ext.gfm.tables.TablesExtension;
|
||||||
|
import org.commonmark.renderer.markdown.MarkdownRenderer;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.markdown.DocumentDataParser;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
@ -120,24 +124,18 @@ public class LayoutParsingPipeline {
|
|||||||
File viewerDocumentFile = originFile;
|
File viewerDocumentFile = originFile;
|
||||||
|
|
||||||
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
|
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
|
||||||
if (layoutParsingRequest.visualLayoutParsingFileId()
|
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
|
||||||
.isPresent()) {
|
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get());
|
||||||
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId()
|
|
||||||
.get());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||||
if (layoutParsingRequest.imagesFileStorageId()
|
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
||||||
.isPresent()) {
|
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
||||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
|
|
||||||
.get());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||||
if (layoutParsingRequest.tablesFileStorageId()
|
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||||
.isPresent()) {
|
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
|
|
||||||
.get());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
|
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
|
||||||
@ -163,12 +161,22 @@ public class LayoutParsingPipeline {
|
|||||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
||||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
|
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
|
||||||
|
|
||||||
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND)) {
|
if (layoutParsingRequest.researchDocumentStorageId() != null) {
|
||||||
log.info("Building research document data for {}", layoutParsingRequest.identifier());
|
log.info("Building research document data for {}", layoutParsingRequest.identifier());
|
||||||
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
|
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
|
||||||
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
|
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (layoutParsingRequest.markdownDocumentStorageId() != null) {
|
||||||
|
log.info("Rendering document data as markdown for {}", layoutParsingRequest.identifier());
|
||||||
|
var markdownDocument = DocumentDataParser.parse(documentGraph.streamAllSubNodes());
|
||||||
|
MarkdownRenderer renderer = MarkdownRenderer.builder().extensions(List.of(TablesExtension.create())).build();
|
||||||
|
String markdown = renderer.render(markdownDocument);
|
||||||
|
try (var in = new ByteArrayInputStream(markdown.getBytes())) {
|
||||||
|
layoutParsingStorageService.storeObject(layoutParsingRequest.markdownDocumentStorageId(), in);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!viewerDocumentFile.equals(originFile)) {
|
if (!viewerDocumentFile.equals(originFile)) {
|
||||||
viewerDocumentFile.delete();
|
viewerDocumentFile.delete();
|
||||||
}
|
}
|
||||||
@ -254,7 +262,7 @@ public class LayoutParsingPipeline {
|
|||||||
OutlineObject lastProcessedOutlineObject = null;
|
OutlineObject lastProcessedOutlineObject = null;
|
||||||
|
|
||||||
// parsing the structure elements could be useful as well
|
// parsing the structure elements could be useful as well
|
||||||
if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
|
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
|
||||||
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -302,13 +310,9 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
|
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
|
||||||
|
|
||||||
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
|
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(),
|
||||||
pdPage,
|
|
||||||
pageNumber,
|
|
||||||
cleanRulings,
|
|
||||||
stripper.getTextPositionSequences(),
|
|
||||||
|
|
||||||
false);
|
false);
|
||||||
|
|
||||||
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
||||||
.addAll(graphics.stream()
|
.addAll(graphics.stream()
|
||||||
@ -319,16 +323,11 @@ public class LayoutParsingPipeline {
|
|||||||
case REDACT_MANAGER_OLD ->
|
case REDACT_MANAGER_OLD ->
|
||||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
|
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
|
||||||
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
||||||
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> docstrumBlockificationService.blockify(words,
|
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
|
||||||
cleanRulings,
|
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
|
||||||
true,
|
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||||
classificationDocument.getVisualizations(),
|
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
|
||||||
layoutParsingType);
|
default -> throw new IllegalArgumentException("Unexpected LayoutParsingType: " + layoutParsingType);
|
||||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words,
|
|
||||||
cleanRulings,
|
|
||||||
false,
|
|
||||||
classificationDocument.getVisualizations(),
|
|
||||||
layoutParsingType);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
classificationPage.setCleanRulings(cleanRulings);
|
classificationPage.setCleanRulings(cleanRulings);
|
||||||
@ -338,7 +337,7 @@ public class LayoutParsingPipeline {
|
|||||||
classificationPage.setPageWidth(cropbox.getWidth());
|
classificationPage.setPageWidth(cropbox.getWidth());
|
||||||
classificationPage.setPageHeight(cropbox.getHeight());
|
classificationPage.setPageHeight(cropbox.getHeight());
|
||||||
|
|
||||||
if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
|
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
|
||||||
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
|
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
|
||||||
|
|
||||||
OutlineObject notFoundOutlineObject = null;
|
OutlineObject notFoundOutlineObject = null;
|
||||||
@ -387,8 +386,8 @@ public class LayoutParsingPipeline {
|
|||||||
}
|
}
|
||||||
log.info("Classify TextBlocks for {}", identifier);
|
log.info("Classify TextBlocks for {}", identifier);
|
||||||
switch (layoutParsingType) {
|
switch (layoutParsingType) {
|
||||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> redactManagerClassificationService.classifyDocument(
|
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
|
||||||
classificationDocument);
|
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||||
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||||
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -102,6 +102,11 @@ public class LayoutParsingStorageService {
|
|||||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
|
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void storeObject(String storageId, InputStream in) {
|
||||||
|
|
||||||
|
storageService.storeObject(TenantContext.getTenantId(), storageId, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException {
|
private File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException {
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,74 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor;
|
||||||
|
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.commonmark.Extension;
|
||||||
|
import org.commonmark.ext.gfm.tables.TablesExtension;
|
||||||
|
import org.commonmark.node.Document;
|
||||||
|
import org.commonmark.node.Node;
|
||||||
|
import org.commonmark.parser.Parser;
|
||||||
|
import org.commonmark.renderer.Renderer;
|
||||||
|
import org.commonmark.renderer.markdown.MarkdownRenderer;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.iqser.red.storage.commons.service.StorageService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.markdown.MarkdownChunker;
|
||||||
|
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||||
|
|
||||||
|
import io.micrometer.observation.annotation.Observed;
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class MarkdownParsingPipeline {
|
||||||
|
|
||||||
|
StorageService storageService;
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
@Observed(name = "MarkdownParsingPipeline", contextualName = "parse-markdown")
|
||||||
|
public LayoutParsingFinishedEvent parseMarkdownAndSaveToStorage(LayoutParsingRequest request) {
|
||||||
|
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
|
String markdown;
|
||||||
|
try (var in = storageService.getObject(TenantContext.getTenantId(), request.originFileStorageId()).getInputStream()) {
|
||||||
|
markdown = new String(in.readAllBytes(), StandardCharsets.UTF_8);
|
||||||
|
}
|
||||||
|
Parser parser = buildParser();
|
||||||
|
Node node = parser.parse(markdown);
|
||||||
|
|
||||||
|
MarkdownChunker chunker = new MarkdownChunker(600);
|
||||||
|
|
||||||
|
node.accept(chunker);
|
||||||
|
|
||||||
|
Renderer renderer = buildRenderer();
|
||||||
|
List<Document> markdownChunks = chunker.getResult();
|
||||||
|
for (Document markdownChunk : markdownChunks) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return LayoutParsingFinishedEvent.builder().identifier(request.identifier()).numberOfPages(1).duration(System.currentTimeMillis() - start).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static Parser buildParser() {
|
||||||
|
|
||||||
|
List<Extension> extensions = List.of(TablesExtension.create());
|
||||||
|
return Parser.builder().extensions(extensions).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static MarkdownRenderer buildRenderer() {
|
||||||
|
|
||||||
|
List<Extension> extensions = List.of(TablesExtension.create());
|
||||||
|
return MarkdownRenderer.builder().extensions(extensions).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,305 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.markdown;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import org.commonmark.ext.gfm.tables.TableBlock;
|
||||||
|
import org.commonmark.ext.gfm.tables.TableBody;
|
||||||
|
import org.commonmark.ext.gfm.tables.TableCell;
|
||||||
|
import org.commonmark.ext.gfm.tables.TableHead;
|
||||||
|
import org.commonmark.ext.gfm.tables.TableRow;
|
||||||
|
import org.commonmark.node.Document;
|
||||||
|
import org.commonmark.node.Emphasis;
|
||||||
|
import org.commonmark.node.HardLineBreak;
|
||||||
|
import org.commonmark.node.Heading;
|
||||||
|
import org.commonmark.node.Node;
|
||||||
|
import org.commonmark.node.SoftLineBreak;
|
||||||
|
import org.commonmark.node.StrongEmphasis;
|
||||||
|
import org.commonmark.node.Text;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
public class DocumentDataParser {
|
||||||
|
|
||||||
|
public Document parse(Stream<SemanticNode> semanticNodes) {
|
||||||
|
|
||||||
|
Document document = new Document();
|
||||||
|
semanticNodes.map(DocumentDataParser::parseNode)
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.forEach(document::appendChild);
|
||||||
|
return document;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Node parseNode(SemanticNode semanticNode) {
|
||||||
|
|
||||||
|
return switch (semanticNode.getType()) {
|
||||||
|
case HEADLINE -> parseHeadline((Headline) semanticNode);
|
||||||
|
case PARAGRAPH -> parseParagraph((Paragraph) semanticNode);
|
||||||
|
case TABLE -> parseTable((Table) semanticNode);
|
||||||
|
default -> null;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private TableBlock parseTable(Table table) {
|
||||||
|
|
||||||
|
TableBlock tableNode = new TableBlock();
|
||||||
|
TableHead head = new TableHead();
|
||||||
|
TableRow tableRow = createTableRow(table, 0);
|
||||||
|
head.appendChild(tableRow);
|
||||||
|
int row = 1;
|
||||||
|
for (; row < table.getNumberOfRows() && table.streamRow(row)
|
||||||
|
.allMatch(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell::isHeader); row++) {
|
||||||
|
head.appendChild(createTableRow(table, row));
|
||||||
|
}
|
||||||
|
tableNode.appendChild(head);
|
||||||
|
TableBody tableBody = new TableBody();
|
||||||
|
for (; row < table.getNumberOfRows(); row++) {
|
||||||
|
tableBody.appendChild(createTableRow(table, row));
|
||||||
|
}
|
||||||
|
tableNode.appendChild(tableBody);
|
||||||
|
return tableNode;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private TableRow createTableRow(Table table, int row) {
|
||||||
|
|
||||||
|
TableRow tableRow = new TableRow();
|
||||||
|
table.streamRow(row)
|
||||||
|
.map(DocumentDataParser::createTableCell)
|
||||||
|
.forEach(tableRow::appendChild);
|
||||||
|
return tableRow;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Node createTableCell(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell tc) {
|
||||||
|
|
||||||
|
var cell = new TableCell();
|
||||||
|
parseTextBlock(tc.getTextBlock()).forEach(cell::appendChild);
|
||||||
|
return cell;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private org.commonmark.node.Paragraph parseParagraph(Paragraph paragraph) {
|
||||||
|
|
||||||
|
org.commonmark.node.Paragraph heading = new org.commonmark.node.Paragraph();
|
||||||
|
parseTextBlockWithLineBreaks(paragraph.getTextBlock()).forEach(heading::appendChild);
|
||||||
|
return heading;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Heading parseHeadline(Headline headline) {
|
||||||
|
|
||||||
|
Heading heading = new Heading();
|
||||||
|
heading.setLevel(headline.getTreeId().size());
|
||||||
|
parseTextBlockWithLineBreaks(headline.getTextBlock()).forEach(heading::appendChild);
|
||||||
|
return heading;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Node> parseTextBlockWithLineBreaks(TextBlock textBlock) {
|
||||||
|
|
||||||
|
LinkedList<Node> result = new LinkedList<>();
|
||||||
|
List<TextRangeWithTextType> textRanges = mergeTextStyles(textBlock);
|
||||||
|
for (TextRangeWithTextType textRange : textRanges) {
|
||||||
|
if (textBlock.subSequenceWithLineBreaks(textRange.textRange()).equals("\n")) {
|
||||||
|
result.add(new HardLineBreak());
|
||||||
|
}
|
||||||
|
String text = textBlock.subSequenceWithLineBreaks(textRange.textRange());
|
||||||
|
String[] lines = text.split("\n");
|
||||||
|
for (String line : lines) {
|
||||||
|
String cleanedLine = line.trim();
|
||||||
|
if (cleanedLine.isEmpty()) {
|
||||||
|
result.add(new HardLineBreak());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
switch (textRange.fontStyle()) {
|
||||||
|
case REGULAR -> result.add(new Text(cleanedLine));
|
||||||
|
case BOLD -> {
|
||||||
|
StrongEmphasis boldBlock = new StrongEmphasis();
|
||||||
|
boldBlock.appendChild(new Text(cleanedLine));
|
||||||
|
result.add(boldBlock);
|
||||||
|
}
|
||||||
|
case ITALIC -> {
|
||||||
|
Emphasis italicBlock = new Emphasis("_");
|
||||||
|
italicBlock.appendChild(new Text(cleanedLine));
|
||||||
|
result.add(italicBlock);
|
||||||
|
}
|
||||||
|
case BOLD_ITALIC -> {
|
||||||
|
Emphasis italicBlock = new Emphasis("_");
|
||||||
|
|
||||||
|
StrongEmphasis boldBlock = new StrongEmphasis();
|
||||||
|
boldBlock.appendChild(new Text(cleanedLine));
|
||||||
|
|
||||||
|
italicBlock.appendChild(boldBlock);
|
||||||
|
result.add(italicBlock);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result.add(new HardLineBreak());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result.removeLast();
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Node> parseTextBlock(TextBlock textBlock) {
|
||||||
|
|
||||||
|
List<Node> result = new ArrayList<>();
|
||||||
|
List<TextRangeWithTextType> textRanges = mergeTextStyles(textBlock);
|
||||||
|
for (TextRangeWithTextType textRange : textRanges) {
|
||||||
|
switch (textRange.fontStyle()) {
|
||||||
|
case REGULAR -> result.add(new Text(textBlock.subSequence(textRange.textRange()).toString()));
|
||||||
|
case BOLD -> {
|
||||||
|
StrongEmphasis boldBlock = new StrongEmphasis();
|
||||||
|
boldBlock.appendChild(new Text(textBlock.subSequence(textRange.textRange()).toString()));
|
||||||
|
result.add(boldBlock);
|
||||||
|
}
|
||||||
|
case ITALIC -> {
|
||||||
|
Emphasis italicBlock = new Emphasis("_");
|
||||||
|
italicBlock.appendChild(new Text(textBlock.subSequence(textRange.textRange()).toString()));
|
||||||
|
result.add(italicBlock);
|
||||||
|
}
|
||||||
|
case BOLD_ITALIC -> {
|
||||||
|
Emphasis italicBlock = new Emphasis("_");
|
||||||
|
|
||||||
|
StrongEmphasis boldBlock = new StrongEmphasis();
|
||||||
|
boldBlock.appendChild(new Text(textBlock.subSequence(textRange.textRange()).toString()));
|
||||||
|
|
||||||
|
italicBlock.appendChild(boldBlock);
|
||||||
|
result.add(italicBlock);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Flattens the (possibly overlapping) bold and italic ranges of a text block into a
 * sorted list of non-overlapping ranges, each tagged with a single effective {@link FontStyle}.
 *
 * Uses a sweep over style-change points: every bold/italic boundary contributes an
 * enter/leave event, and the style between two consecutive points is derived from the
 * set of styles active there.
 */
private List<TextRangeWithTextType> mergeTextStyles(TextBlock textBlock) {

    List<TextRangeWithTextType> result = new ArrayList<>();

    // Sorted map: absolute text position -> style changes taking effect at that position.
    TreeMap<Integer, Set<FontStyleChange>> styleChanges = new TreeMap<>();

    int start = textBlock.getTextRange().start();
    int end = textBlock.getTextRange().end();

    // Bold/italic boundaries are relative to the block, so offset them by the block start.
    for (TextRange bold : textBlock.getBoldTextBoundaries()) {
        styleChanges.computeIfAbsent(bold.start() + start, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.BOLD));
        styleChanges.computeIfAbsent(bold.end() + start, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.BOLD));
    }

    for (TextRange italic : textBlock.getItalicTextBoundaries()) {
        styleChanges.computeIfAbsent(italic.start() + start, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.ITALIC));
        styleChanges.computeIfAbsent(italic.end() + start, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.ITALIC));
    }

    // No styling at all: the whole block is one regular range.
    if (styleChanges.isEmpty()) {
        result.add(new TextRangeWithTextType(new TextRange(start, end), FontStyle.REGULAR));
        return result;
    }

    Set<FontStyle> currentStyles = new HashSet<>();
    currentStyles.add(FontStyle.REGULAR);

    // Sweep left to right; emit a range for the segment ending at each change point.
    for (Map.Entry<Integer, Set<FontStyleChange>> entry : styleChanges.entrySet()) {

        int point = entry.getKey();
        Set<FontStyleChange> changes = entry.getValue();

        if (point > start) {
            FontStyle style = determineFontStyle(currentStyles);
            result.add(new TextRangeWithTextType(new TextRange(start, point), style));
        }

        // Apply "leave" events before "enter" events so back-to-back ranges hand over cleanly.
        changes.stream()
                .filter(FontStyleChange::leave)
                .map(FontStyleChange::style)
                .toList()
                .forEach(currentStyles::remove);

        currentStyles.addAll(changes.stream()
                .filter(FontStyleChange::enter)
                .map(FontStyleChange::style)
                .toList());

        // Never leave the active set empty; fall back to regular text.
        if (currentStyles.isEmpty()) {
            currentStyles.add(FontStyle.REGULAR);
        }

        start = point;
    }

    // Trailing segment after the last change point.
    if (start < end) {
        FontStyle style = determineFontStyle(currentStyles);
        result.add(new TextRangeWithTextType(new TextRange(start, textBlock.getTextRange().end()), style));
    }

    // NOTE(review): ranges of length 0 or 1 are dropped here — presumably to suppress
    // stray single-character styling artifacts; confirm single styled characters are
    // really not wanted downstream.
    return result.stream()
            .filter(t -> t.textRange.length() > 1)
            .toList();
}
|
||||||
|
|
||||||
|
|
||||||
|
private FontStyle determineFontStyle(Set<FontStyle> styles) {
|
||||||
|
|
||||||
|
if (styles.contains(FontStyle.BOLD) && styles.contains(FontStyle.ITALIC)) {
|
||||||
|
return FontStyle.BOLD_ITALIC;
|
||||||
|
} else if (styles.contains(FontStyle.BOLD)) {
|
||||||
|
return FontStyle.BOLD;
|
||||||
|
} else if (styles.contains(FontStyle.ITALIC)) {
|
||||||
|
return FontStyle.ITALIC;
|
||||||
|
} else {
|
||||||
|
return FontStyle.REGULAR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** The effective rendering style of a text range after merging bold/italic boundaries. */
enum FontStyle {

    REGULAR,
    BOLD,
    ITALIC,
    BOLD_ITALIC;
}
|
||||||
|
|
||||||
|
/**
 * A single style-change event used by the merge sweep: either entering or leaving
 * the given {@code style} at some text position.
 *
 * @param enter true if the style becomes active at this point, false if it ends
 * @param style the style being entered or left
 */
record FontStyleChange(boolean enter, FontStyle style) {

    /** Creates an event marking the start of {@code style}. */
    public static FontStyleChange enter(FontStyle style) {

        return new FontStyleChange(true, style);
    }

    /** Creates an event marking the end of {@code style}. */
    public static FontStyleChange leave(FontStyle style) {

        return new FontStyleChange(false, style);
    }

    /** Convenience inverse of {@link #enter()}; used as a method-reference filter. */
    public boolean leave() {

        return !enter;
    }
}
|
||||||
|
|
||||||
|
/** A text range paired with the single font style that applies to all of it. */
record TextRangeWithTextType(TextRange textRange, FontStyle fontStyle) {

}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,13 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.markdown;
|
||||||
|
|
||||||
|
import org.commonmark.node.IndentedCodeBlock;
|
||||||
|
import org.commonmark.node.Paragraph;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TokenCounter;
|
||||||
|
|
||||||
|
/**
 * Fixed token overheads added on top of the counted content tokens for specific
 * markdown node types when estimating chunk sizes.
 */
public class ExtraTokens {

    // Declared final: these are constants and were never meant to be reassigned
    // (public static mutable fields are an accident waiting to happen).
    /** Extra tokens budgeted for an indented code block's markup. */
    public static final int INDENTED_CODE_BLOCK = 10;
    /** Extra tokens budgeted for a paragraph's markup/separation. */
    public static final int PARAGRAPH = 10;

}
|
||||||
@ -0,0 +1,527 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.markdown;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.MarkdownParsingPipeline.buildRenderer;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.utils.TokenCounter.countTokens;
|
||||||
|
|
||||||
|
import java.text.BreakIterator;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Deque;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.commonmark.ext.gfm.tables.TableBlock;
|
||||||
|
import org.commonmark.ext.gfm.tables.TableBody;
|
||||||
|
import org.commonmark.node.AbstractVisitor;
|
||||||
|
import org.commonmark.node.BlockQuote;
|
||||||
|
import org.commonmark.node.BulletList;
|
||||||
|
import org.commonmark.node.Code;
|
||||||
|
import org.commonmark.node.CustomBlock;
|
||||||
|
import org.commonmark.node.CustomNode;
|
||||||
|
import org.commonmark.node.Document;
|
||||||
|
import org.commonmark.node.Emphasis;
|
||||||
|
import org.commonmark.node.FencedCodeBlock;
|
||||||
|
import org.commonmark.node.HardLineBreak;
|
||||||
|
import org.commonmark.node.Heading;
|
||||||
|
import org.commonmark.node.HtmlBlock;
|
||||||
|
import org.commonmark.node.HtmlInline;
|
||||||
|
import org.commonmark.node.Image;
|
||||||
|
import org.commonmark.node.IndentedCodeBlock;
|
||||||
|
import org.commonmark.node.Link;
|
||||||
|
import org.commonmark.node.LinkReferenceDefinition;
|
||||||
|
import org.commonmark.node.ListBlock;
|
||||||
|
import org.commonmark.node.ListItem;
|
||||||
|
import org.commonmark.node.Node;
|
||||||
|
import org.commonmark.node.OrderedList;
|
||||||
|
import org.commonmark.node.Paragraph;
|
||||||
|
import org.commonmark.node.SoftLineBreak;
|
||||||
|
import org.commonmark.node.StrongEmphasis;
|
||||||
|
import org.commonmark.node.Text;
|
||||||
|
import org.commonmark.node.ThematicBreak;
|
||||||
|
import org.commonmark.renderer.Renderer;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TokenCounter;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class MarkdownChunker extends AbstractVisitor {
|
||||||
|
|
||||||
|
NodeCopier nodeCopier = NodeCopier.INSTANCE;
|
||||||
|
|
||||||
|
final int tokenLimit;
|
||||||
|
List<Document> allChunks;
|
||||||
|
|
||||||
|
Deque<Heading> currentHeadings;
|
||||||
|
Document chunk;
|
||||||
|
boolean validChunk;
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Creates a chunker that splits a markdown document into chunks of at most
 * {@code tokenLimit} tokens each.
 *
 * @param tokenLimit maximum number of tokens allowed per emitted chunk
 */
public MarkdownChunker(int tokenLimit) {

    this.tokenLimit = tokenLimit;
    allChunks = new LinkedList<>();
    currentHeadings = new LinkedList<>();
    // Open the first (initially empty) chunk so visit methods always have a target.
    startNewChunk();
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<Document> getResult() {
|
||||||
|
|
||||||
|
for (Document chunk : allChunks) {
|
||||||
|
if (countTokens(chunk) > tokenLimit) {
|
||||||
|
throwUnsplittableNodeError(chunk);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return allChunks;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
public void visit(Heading heading) {

    // Headings deeper than level 4 are also emitted inline into the current chunk.
    // NOTE(review): execution still falls through to the stack update and
    // startNewChunk() below even for these deep headings — confirm that is intended
    // and not a missing early return.
    if (heading.getLevel() > 4) {
        addToChunk(heading);
    }
    // Maintain the stack of ancestor headings: push if strictly deeper than the top,
    // otherwise pop all headings at the same or deeper level first.
    if (currentHeadings.isEmpty() || currentHeadings.peek().getLevel() < heading.getLevel()) {
        currentHeadings.push(heading);
    } else {
        while (!currentHeadings.isEmpty() && currentHeadings.peek().getLevel() >= heading.getLevel()) {
            currentHeadings.pop();
        }
        currentHeadings.push(heading);
    }

    // Every heading starts a new chunk seeded with the current heading context.
    startNewChunk();
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Closes the current chunk and opens a fresh one pre-populated with copies of the
 * active headings. A previous chunk that never received real content
 * ({@code validChunk == false}) is discarded instead of being kept as a
 * headings-only chunk.
 */
private void startNewChunk() {

    if (!validChunk && !allChunks.isEmpty()) {
        allChunks.remove(allChunks.size() - 1);
    }
    validChunk = false;
    chunk = buildNewChunk();
    allChunks.add(chunk);
}
|
||||||
|
|
||||||
|
|
||||||
|
private Document buildNewChunk() {
|
||||||
|
|
||||||
|
Document document = new Document();
|
||||||
|
List<Node> headingCopies = currentHeadings.stream()
|
||||||
|
.map(nodeCopier::copyNodeWithChildren)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
Collections.reverse(headingCopies);
|
||||||
|
headingCopies.forEach(document::appendChild);
|
||||||
|
return document;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** Returns the token count of the chunk currently being filled. */
public int currentTokenCount() {

    return TokenCounter.countTokens(chunk);
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Checks whether {@code node}, placed alone in a fresh chunk (including the
 * active-heading preamble), would stay within the token limit.
 *
 * @param node the node to measure; it is copied, the original tree is untouched
 * @return true if the node plus heading context fits into {@code tokenLimit}
 */
public boolean fitsTokenLimit(Node node) {

    Document document = buildNewChunk();
    document.appendChild(nodeCopier.copyNodeWithChildren(node));
    return TokenCounter.countTokens(document) <= tokenLimit;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Appends {@code node} to the current chunk. If that would overflow the token limit,
 * the node is moved to a fresh chunk; if it does not even fit a fresh chunk on its
 * own, it is split recursively via {@link #splitNodeAndAddToChunk(Node)}.
 */
private void addToChunk(Node node) {

    chunk.appendChild(node);

    // Fits in the current chunk — done.
    // NOTE(review): validChunk is NOT set on this path, only on the fresh-chunk path
    // below; a chunk filled exclusively via this early return could later be dropped
    // by startNewChunk(). Confirm this is the intended semantics of validChunk.
    if (currentTokenCount() <= tokenLimit) {
        return;
    }

    // Overflow: retry the node in a brand-new chunk.
    node.unlink();
    startNewChunk();
    chunk.appendChild(node);

    if (currentTokenCount() > tokenLimit) { // node is too large and won't fit in tokenLimit, split is necessary
        node.unlink();
        startNewChunk();
        splitNodeAndAddToChunk(node);
        return;
    }

    validChunk = true;

}
|
||||||
|
|
||||||
|
|
||||||
|
private void splitNodeAndAddToChunk(Node node) {
|
||||||
|
|
||||||
|
if (node instanceof TableBlock tableBlock) {
|
||||||
|
splitTable(tableBlock);
|
||||||
|
return;
|
||||||
|
} else if (node instanceof BulletList bulletList) {
|
||||||
|
splitList(bulletList);
|
||||||
|
return;
|
||||||
|
} else if (node instanceof OrderedList orderedList) {
|
||||||
|
splitList(orderedList);
|
||||||
|
return;
|
||||||
|
} else if (node instanceof Paragraph paragraph) {
|
||||||
|
splitParagraph(paragraph);
|
||||||
|
return;
|
||||||
|
} else if (node instanceof IndentedCodeBlock indentedCodeBlock) {
|
||||||
|
splitCodeBlock(indentedCodeBlock);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
throwUnsplittableNodeError(node);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void splitCodeBlock(IndentedCodeBlock indentedCodeBlock) {
|
||||||
|
|
||||||
|
List<IndentedCodeBlock> splitBlocks = new LinkedList<>();
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
BreakIterator lineIterator = BreakIterator.getLineInstance(Locale.ENGLISH);
|
||||||
|
lineIterator.setText(indentedCodeBlock.getLiteral());
|
||||||
|
int start = lineIterator.first();
|
||||||
|
for (int end = lineIterator.next(); end != BreakIterator.DONE; start = end, end = lineIterator.next()) {
|
||||||
|
String sentence = indentedCodeBlock.getLiteral().substring(start, end);
|
||||||
|
if (!fitsTokenLimit(buildIndentedCodeBlock(sb.toString()))) {
|
||||||
|
sb.replace(sb.length() - sentence.length(), sb.length(), "");
|
||||||
|
IndentedCodeBlock block = buildIndentedCodeBlock(sb.toString());
|
||||||
|
splitBlocks.add(block);
|
||||||
|
sb = new StringBuilder();
|
||||||
|
}
|
||||||
|
sb.append(sentence);
|
||||||
|
}
|
||||||
|
if (!sb.isEmpty()) {
|
||||||
|
if (fitsTokenLimit(buildIndentedCodeBlock(sb.toString()))) {
|
||||||
|
splitBlocks.add(buildIndentedCodeBlock(sb.toString()));
|
||||||
|
} else {
|
||||||
|
int mid = sb.length() / 2;
|
||||||
|
splitBlocks.add(buildIndentedCodeBlock(sb.substring(0, mid)));
|
||||||
|
splitBlocks.add(buildIndentedCodeBlock(sb.substring(mid, sb.length())));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
splitBlocks.forEach(this::addToChunk);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** Creates a fresh indented code block holding the given literal text. */
private static IndentedCodeBlock buildIndentedCodeBlock(String string) {

    IndentedCodeBlock block = new IndentedCodeBlock();
    block.setLiteral(string);
    return block;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Adds a paragraph to the chunk stream, recursively halving it by inline children
 * until the pieces fit the token limit. A paragraph whose single child is a
 * {@link Text} node is split at sentence boundaries instead; any other unsplittable
 * single child is an error.
 */
private void splitParagraph(Paragraph paragraph) {

    if (fitsTokenLimit(paragraph)) {
        addToChunk(paragraph);
        return;
    }

    List<Node> children = collectAllChildNodes(paragraph);

    if (children.size() == 1) {
        if (children.get(0) instanceof Text text) {
            // Sentence-level split; each piece is wrapped in its own paragraph.
            List<Text> splitTexts = splitText(text);
            for (Text splitText : splitTexts) {
                Paragraph paragraph1 = new Paragraph();
                paragraph1.appendChild(splitText);
                addToChunk(paragraph1);
            }
            return;
        }
        throwUnsplittableNodeError(children.get(0));
    }

    // Halve the inline children into two new paragraphs and recurse on each.
    Paragraph paragraph1 = new Paragraph();
    Paragraph paragraph2 = new Paragraph();

    int mid = children.size() / 2;
    children.subList(0, mid)
            .forEach(paragraph1::appendChild);
    children.subList(mid, children.size())
            .forEach(paragraph2::appendChild);

    splitParagraph(paragraph1);
    splitParagraph(paragraph2);
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Logs the rendered markdown of an unsplittable oversized node and throws.
 *
 * @throws IllegalArgumentException always, describing the node and its token count
 */
private void throwUnsplittableNodeError(Node node) {

    Renderer renderer = buildRenderer();
    String renderedNode = renderer.render(node);
    log.error(renderedNode);
    throw new IllegalArgumentException(String.format("Node %s exceeds token limit (%d/%d) and can't be split!", node, countTokens(renderedNode), tokenLimit));
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<Node> collectAllChildNodes(Node parent) {
|
||||||
|
|
||||||
|
List<Node> children = new LinkedList<>();
|
||||||
|
Node next;
|
||||||
|
for (Node child = parent.getFirstChild(); child != null; child = next) {
|
||||||
|
next = child.getNext();
|
||||||
|
children.add(child);
|
||||||
|
}
|
||||||
|
return children;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Text> splitText(Text text) {
|
||||||
|
|
||||||
|
List<Text> splitTexts = new LinkedList<>();
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.ENGLISH);
|
||||||
|
sentenceIterator.setText(text.getLiteral());
|
||||||
|
int start = sentenceIterator.first();
|
||||||
|
for (int end = sentenceIterator.next(); end != BreakIterator.DONE; start = end, end = sentenceIterator.next()) {
|
||||||
|
String sentence = text.getLiteral().substring(start, end);
|
||||||
|
if (!fitsTokenLimit(buildParagraphWithText(sb))) {
|
||||||
|
sb.replace(sb.length() - sentence.length(), sb.length(), "");
|
||||||
|
splitTexts.add(new Text(sb.toString()));
|
||||||
|
sb = new StringBuilder();
|
||||||
|
}
|
||||||
|
sb.append(sentence);
|
||||||
|
}
|
||||||
|
if (!sb.isEmpty()) {
|
||||||
|
if (fitsTokenLimit(buildParagraphWithText(sb))) {
|
||||||
|
splitTexts.add(new Text(sb.toString()));
|
||||||
|
} else {
|
||||||
|
int mid = sb.length() / 2;
|
||||||
|
splitTexts.add(new Text(sb.substring(0, mid)));
|
||||||
|
splitTexts.add(new Text(sb.substring(mid, sb.length())));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return splitTexts;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** Wraps the builder's current content in a single-text paragraph, for size probing. */
private static Paragraph buildParagraphWithText(StringBuilder sb) {

    Paragraph paragraph = new Paragraph();
    paragraph.appendChild(new Text(sb.toString()));
    return paragraph;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Adds a bullet list to the chunk stream, delegating to {@link #splitLists} to halve
 * it when it exceeds the token limit.
 */
private void splitList(BulletList bulletList) {

    if (fitsTokenLimit(bulletList)) {
        addToChunk(bulletList);
        return;
    }

    BulletList list1 = new BulletList();
    BulletList list2 = new BulletList();

    splitLists(bulletList, list1, list2);
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Adds an ordered list to the chunk stream, delegating to {@link #splitLists} to
 * halve it when it exceeds the token limit.
 */
private void splitList(OrderedList orderedList) {

    if (fitsTokenLimit(orderedList)) {
        addToChunk(orderedList);
        return;
    }

    OrderedList list1 = new OrderedList();
    OrderedList list2 = new OrderedList();

    splitLists(orderedList, list1, list2);
}
|
||||||
|
|
||||||
|
|
||||||
|
private void splitLists(ListBlock originList, ListBlock list1, ListBlock list2) {
|
||||||
|
|
||||||
|
List<Node> listItems = collectAllChildNodes(originList);
|
||||||
|
|
||||||
|
if (listItems.size() == 1) {
|
||||||
|
collectAllChildNodes(listItems.get(0)).forEach(this::addToChunk);
|
||||||
|
}
|
||||||
|
|
||||||
|
int mid = listItems.size() / 2;
|
||||||
|
listItems.subList(0, mid)
|
||||||
|
.forEach(list1::appendChild);
|
||||||
|
listItems.subList(mid, listItems.size())
|
||||||
|
.forEach(list2::appendChild);
|
||||||
|
|
||||||
|
splitNodeAndAddToChunk(list1);
|
||||||
|
splitNodeAndAddToChunk(list2);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void splitTable(TableBlock tableBlock) {
|
||||||
|
|
||||||
|
if (fitsTokenLimit(tableBlock)) {
|
||||||
|
addToChunk(tableBlock);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
TableBlock tableBlock1 = new TableBlock();
|
||||||
|
TableBlock tableBlock2 = new TableBlock();
|
||||||
|
|
||||||
|
tableBlock1.appendChild(nodeCopier.copy(tableBlock.getFirstChild()));
|
||||||
|
tableBlock2.appendChild(nodeCopier.copy(tableBlock.getFirstChild()));
|
||||||
|
|
||||||
|
TableBody tableBody1 = new TableBody();
|
||||||
|
TableBody tableBody2 = new TableBody();
|
||||||
|
|
||||||
|
List<Node> tableRows = collectAllChildNodes(tableBlock.getLastChild());
|
||||||
|
|
||||||
|
if (tableRows.isEmpty()) {
|
||||||
|
throw new IllegalArgumentException("The table headers already exceeds the token limit");
|
||||||
|
}
|
||||||
|
if (tableRows.size() == 1) {
|
||||||
|
throw new IllegalArgumentException("A single table row already exceeds the token limit");
|
||||||
|
}
|
||||||
|
|
||||||
|
int mid = tableRows.size() / 2;
|
||||||
|
tableRows.subList(0, mid)
|
||||||
|
.forEach(tableBody1::appendChild);
|
||||||
|
tableRows.subList(mid, tableRows.size())
|
||||||
|
.forEach(tableBody2::appendChild);
|
||||||
|
|
||||||
|
splitTable(tableBlock1);
|
||||||
|
splitTable(tableBlock2);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(BlockQuote blockQuote) {
|
||||||
|
|
||||||
|
this.addToChunk(blockQuote);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(BulletList bulletList) {
|
||||||
|
|
||||||
|
this.addToChunk(bulletList);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(Code code) {
|
||||||
|
|
||||||
|
this.addToChunk(code);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(Emphasis emphasis) {
|
||||||
|
|
||||||
|
this.addToChunk(emphasis);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(FencedCodeBlock fencedCodeBlock) {
|
||||||
|
|
||||||
|
this.addToChunk(fencedCodeBlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(HardLineBreak hardLineBreak) {
|
||||||
|
|
||||||
|
this.addToChunk(hardLineBreak);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(ThematicBreak thematicBreak) {
|
||||||
|
|
||||||
|
this.addToChunk(thematicBreak);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(HtmlInline htmlInline) {
|
||||||
|
|
||||||
|
this.addToChunk(htmlInline);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(HtmlBlock htmlBlock) {
|
||||||
|
|
||||||
|
this.addToChunk(htmlBlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(Image image) {
|
||||||
|
|
||||||
|
this.addToChunk(image);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(IndentedCodeBlock indentedCodeBlock) {
|
||||||
|
|
||||||
|
this.addToChunk(indentedCodeBlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(Link link) {
|
||||||
|
|
||||||
|
this.addToChunk(link);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(ListItem listItem) {
|
||||||
|
|
||||||
|
this.addToChunk(listItem);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(OrderedList orderedList) {
|
||||||
|
|
||||||
|
this.addToChunk(orderedList);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(Paragraph paragraph) {
|
||||||
|
|
||||||
|
this.addToChunk(paragraph);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(SoftLineBreak softLineBreak) {
|
||||||
|
|
||||||
|
this.addToChunk(softLineBreak);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(StrongEmphasis strongEmphasis) {
|
||||||
|
|
||||||
|
this.addToChunk(strongEmphasis);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(Text text) {
|
||||||
|
|
||||||
|
this.addToChunk(text);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(LinkReferenceDefinition linkReferenceDefinition) {
|
||||||
|
|
||||||
|
this.addToChunk(linkReferenceDefinition);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(CustomBlock customBlock) {
|
||||||
|
|
||||||
|
this.addToChunk(customBlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void visit(CustomNode customNode) {
|
||||||
|
|
||||||
|
this.addToChunk(customNode);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,171 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.markdown;
|
||||||
|
|
||||||
|
import org.commonmark.ext.gfm.tables.TableBlock;
|
||||||
|
import org.commonmark.ext.gfm.tables.TableBody;
|
||||||
|
import org.commonmark.ext.gfm.tables.TableCell;
|
||||||
|
import org.commonmark.ext.gfm.tables.TableHead;
|
||||||
|
import org.commonmark.ext.gfm.tables.TableRow;
|
||||||
|
import org.commonmark.node.BlockQuote;
|
||||||
|
import org.commonmark.node.BulletList;
|
||||||
|
import org.commonmark.node.Code;
|
||||||
|
import org.commonmark.node.Document;
|
||||||
|
import org.commonmark.node.Emphasis;
|
||||||
|
import org.commonmark.node.FencedCodeBlock;
|
||||||
|
import org.commonmark.node.HardLineBreak;
|
||||||
|
import org.commonmark.node.Heading;
|
||||||
|
import org.commonmark.node.HtmlBlock;
|
||||||
|
import org.commonmark.node.HtmlInline;
|
||||||
|
import org.commonmark.node.Image;
|
||||||
|
import org.commonmark.node.IndentedCodeBlock;
|
||||||
|
import org.commonmark.node.Link;
|
||||||
|
import org.commonmark.node.LinkReferenceDefinition;
|
||||||
|
import org.commonmark.node.ListItem;
|
||||||
|
import org.commonmark.node.Node;
|
||||||
|
import org.commonmark.node.OrderedList;
|
||||||
|
import org.commonmark.node.Paragraph;
|
||||||
|
import org.commonmark.node.SoftLineBreak;
|
||||||
|
import org.commonmark.node.StrongEmphasis;
|
||||||
|
import org.commonmark.node.Text;
|
||||||
|
import org.commonmark.node.ThematicBreak;
|
||||||
|
import org.mapstruct.Mapper;
|
||||||
|
import org.mapstruct.Mapping;
|
||||||
|
import org.mapstruct.factory.Mappers;
|
||||||
|
|
||||||
|
@Mapper
|
||||||
|
public interface NodeCopier {
|
||||||
|
|
||||||
|
NodeCopier INSTANCE = Mappers.getMapper(NodeCopier.class);
|
||||||
|
|
||||||
|
|
||||||
|
default Node copyNodeWithChildren(Node node) {
|
||||||
|
|
||||||
|
Node copy = copy(node);
|
||||||
|
|
||||||
|
Node next;
|
||||||
|
for (Node child = node.getFirstChild(); child != null; child = next) {
|
||||||
|
next = child.getNext();
|
||||||
|
copy.appendChild(copyNodeWithChildren(child));
|
||||||
|
}
|
||||||
|
return copy;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Shallow-copies a node by dispatching on its concrete class to the matching
 * MapStruct-generated copy method. Children are NOT copied — see
 * {@link #copyNodeWithChildren(Node)}.
 *
 * NOTE(review): dispatching on getSimpleName() means subclasses of these node types
 * will not match and will hit the default branch.
 *
 * @throws IllegalArgumentException for node classes without a copy method
 */
default Node copy(Node node) {

    return switch (node.getClass().getSimpleName()) {
        case "BlockQuote" -> copy((BlockQuote) node);
        case "BulletList" -> copy((BulletList) node);
        case "Code" -> copy((Code) node);
        case "Document" -> copy((Document) node);
        case "Emphasis" -> copy((Emphasis) node);
        case "FencedCodeBlock" -> copy((FencedCodeBlock) node);
        case "HardLineBreak" -> copy((HardLineBreak) node);
        case "Heading" -> copy((Heading) node);
        case "HtmlBlock" -> copy((HtmlBlock) node);
        case "HtmlInline" -> copy((HtmlInline) node);
        case "Image" -> copy((Image) node);
        case "IndentedCodeBlock" -> copy((IndentedCodeBlock) node);
        case "Link" -> copy((Link) node);
        case "LinkReferenceDefinition" -> copy((LinkReferenceDefinition) node);
        case "ListItem" -> copy((ListItem) node);
        case "OrderedList" -> copy((OrderedList) node);
        case "Paragraph" -> copy((Paragraph) node);
        case "SoftLineBreak" -> copy((SoftLineBreak) node);
        case "StrongEmphasis" -> copy((StrongEmphasis) node);
        case "Text" -> copy((Text) node);
        case "ThematicBreak" -> copy((ThematicBreak) node);
        case "TableBlock" -> copy((TableBlock) node);
        case "TableBody" -> copy((TableBody) node);
        case "TableCell" -> copy((TableCell) node);
        case "TableHead" -> copy((TableHead) node);
        case "TableRow" -> copy((TableRow) node);
        default -> throw new IllegalArgumentException("No copy method found for class: " + node.getClass().getName());
    };
}
|
||||||
|
|
||||||
|
|
||||||
|
// --- Per-type shallow copy methods, implemented by MapStruct. ---
// Targets without a usable source accessor are explicitly ignored via @Mapping;
// child/sibling links are never mapped — deep copies go through copyNodeWithChildren.

BlockQuote copy(BlockQuote blockQuote);


@Mapping(target = "bulletMarker", ignore = true)
BulletList copy(BulletList bulletList);


Code copy(Code code);


Document copy(Document document);


// Emphasis exposes opening/closing delimiters; the opening one seeds the copy's delimiter.
@Mapping(target = "delimiter", source = "openingDelimiter")
Emphasis copy(Emphasis emphasis);


@Mapping(target = "fenceChar", ignore = true)
@Mapping(target = "fenceLength", ignore = true)
FencedCodeBlock copy(FencedCodeBlock fencedCodeBlock);


HardLineBreak copy(HardLineBreak hardLineBreak);


Heading copy(Heading heading);


HtmlBlock copy(HtmlBlock htmlBlock);


HtmlInline copy(HtmlInline htmlInline);


Image copy(Image image);


IndentedCodeBlock copy(IndentedCodeBlock indentedCodeBlock);


Link copy(Link link);


LinkReferenceDefinition copy(LinkReferenceDefinition linkReferenceDefinition);


ListItem copy(ListItem listItem);


@Mapping(target = "startNumber", ignore = true)
@Mapping(target = "delimiter", ignore = true)
OrderedList copy(OrderedList orderedList);


Paragraph copy(Paragraph paragraph);


SoftLineBreak copy(SoftLineBreak softLineBreak);


@Mapping(target = "delimiter", source = "openingDelimiter")
StrongEmphasis copy(StrongEmphasis strongEmphasis);


Text copy(Text text);


ThematicBreak copy(ThematicBreak thematicBreak);


TableBlock copy(TableBlock tableBlock);


TableBody copy(TableBody tableBody);


TableCell copy(TableCell tableCell);


TableHead copy(TableHead tableHead);


TableRow copy(TableRow tableRow);
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,65 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.markdown;
|
||||||
|
|
||||||
|
import java.lang.reflect.Field;
|
||||||
|
|
||||||
|
import org.commonmark.node.Node;
|
||||||
|
|
||||||
|
public class ReflectionNodeCopier {
|
||||||
|
|
||||||
|
NodeCopier mapperNodeCopier;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public static Node copyNode(Node node) {
|
||||||
|
|
||||||
|
Node copy = deepCopy(node);
|
||||||
|
copyChildren(node, copy);
|
||||||
|
return copy;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void copyChildren(Node nodeToCopy, Node copy) {
|
||||||
|
|
||||||
|
Node next;
|
||||||
|
for (Node node = nodeToCopy.getFirstChild(); node != null; node = next) {
|
||||||
|
next = node.getNext();
|
||||||
|
copy.appendChild(copyNode(node));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static <T> T deepCopy(T object) {
|
||||||
|
|
||||||
|
try {
|
||||||
|
Class<?> clazz = object.getClass();
|
||||||
|
T copy = (T) clazz.getDeclaredConstructor().newInstance();
|
||||||
|
|
||||||
|
for (Field field : clazz.getDeclaredFields()) {
|
||||||
|
field.setAccessible(true);
|
||||||
|
Object value = field.get(object);
|
||||||
|
if (isPrimitiveOrWrapper(field.getType()) || field.getType().equals(String.class)) {
|
||||||
|
field.set(copy, value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return copy;
|
||||||
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean isPrimitiveOrWrapper(Class<?> type) {
|
||||||
|
|
||||||
|
return type.isPrimitive()
|
||||||
|
|| type == Boolean.class
|
||||||
|
|| type == Byte.class
|
||||||
|
|| type == Character.class
|
||||||
|
|| type == Double.class
|
||||||
|
|| type == Float.class
|
||||||
|
|| type == Integer.class
|
||||||
|
|| type == Long.class
|
||||||
|
|| type == Short.class;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -7,6 +7,8 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import com.google.common.base.Functions;
|
||||||
|
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
|
|
||||||
@Getter
|
@Getter
|
||||||
@ -59,7 +61,9 @@ public class FloatFrequencyCounter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return higher.stream().sorted(Collections.reverseOrder()).collect(Collectors.toList());
|
return higher.stream()
|
||||||
|
.sorted(Collections.reverseOrder())
|
||||||
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -74,4 +78,16 @@ public class FloatFrequencyCounter {
|
|||||||
return highest;
|
return highest;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Returns the occurrence-weighted average of all counted values
 * (sum of value * count, divided by the total number of occurrences).
 *
 * NOTE(review): returns NaN when nothing has been counted yet (0/0) — confirm
 * callers handle the empty case.
 */
public double getAverage() {

    double sum = countPerValue.keySet()
            .stream()
            .mapToDouble(fontSize -> fontSize * countPerValue.get(fontSize)).sum();
    double count = countPerValue.values()
            .stream()
            .mapToInt(Integer::intValue).sum();
    return sum / count;
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -13,13 +13,13 @@ import lombok.Setter;
|
|||||||
@Setter
|
@Setter
|
||||||
@EqualsAndHashCode
|
@EqualsAndHashCode
|
||||||
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
|
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
|
||||||
public class Boundary implements Comparable<Boundary> {
|
public class TextRange implements Comparable<TextRange> {
|
||||||
|
|
||||||
private int start;
|
private int start;
|
||||||
private int end;
|
private int end;
|
||||||
|
|
||||||
|
|
||||||
public Boundary(int start, int end) {
|
public TextRange(int start, int end) {
|
||||||
|
|
||||||
if (start > end) {
|
if (start > end) {
|
||||||
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
||||||
@ -47,15 +47,15 @@ public class Boundary implements Comparable<Boundary> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean contains(Boundary boundary) {
|
public boolean contains(TextRange textRange) {
|
||||||
|
|
||||||
return start <= boundary.start() && boundary.end() <= end;
|
return start <= textRange.start() && textRange.end() <= end;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean containedBy(Boundary boundary) {
|
public boolean containedBy(TextRange textRange) {
|
||||||
|
|
||||||
return boundary.contains(this);
|
return textRange.contains(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -83,18 +83,18 @@ public class Boundary implements Comparable<Boundary> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean intersects(Boundary boundary) {
|
public boolean intersects(TextRange textRange) {
|
||||||
|
|
||||||
return boundary.start() < this.end && this.start < boundary.end();
|
return textRange.start() < this.end && this.start < textRange.end();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<Boundary> split(List<Integer> splitIndices) {
|
public List<TextRange> split(List<Integer> splitIndices) {
|
||||||
|
|
||||||
if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
|
if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
|
||||||
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
|
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
|
||||||
}
|
}
|
||||||
List<Boundary> splitBoundaries = new LinkedList<>();
|
List<TextRange> splitBoundaries = new LinkedList<>();
|
||||||
int previousIndex = start;
|
int previousIndex = start;
|
||||||
for (int splitIndex : splitIndices) {
|
for (int splitIndex : splitIndices) {
|
||||||
|
|
||||||
@ -102,10 +102,10 @@ public class Boundary implements Comparable<Boundary> {
|
|||||||
if (splitIndex == previousIndex) {
|
if (splitIndex == previousIndex) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
splitBoundaries.add(new Boundary(previousIndex, splitIndex));
|
splitBoundaries.add(new TextRange(previousIndex, splitIndex));
|
||||||
previousIndex = splitIndex;
|
previousIndex = splitIndex;
|
||||||
}
|
}
|
||||||
splitBoundaries.add(new Boundary(previousIndex, end));
|
splitBoundaries.add(new TextRange(previousIndex, end));
|
||||||
return splitBoundaries;
|
return splitBoundaries;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -114,11 +114,11 @@ public class Boundary implements Comparable<Boundary> {
|
|||||||
return IntStream.range(start, end);
|
return IntStream.range(start, end);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Boundary merge(Collection<Boundary> boundaries) {
|
public static TextRange merge(Collection<TextRange> boundaries) {
|
||||||
|
|
||||||
int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
|
int minStart = boundaries.stream().mapToInt(TextRange::start).min().orElseThrow(IllegalArgumentException::new);
|
||||||
int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
|
int maxEnd = boundaries.stream().mapToInt(TextRange::end).max().orElseThrow(IllegalArgumentException::new);
|
||||||
return new Boundary(minStart, maxEnd);
|
return new TextRange(minStart, maxEnd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -130,12 +130,12 @@ public class Boundary implements Comparable<Boundary> {
|
|||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compareTo(Boundary boundary) {
|
public int compareTo(TextRange textRange) {
|
||||||
|
|
||||||
if (end < boundary.end() && start < boundary.start()) {
|
if (end < textRange.end() && start < textRange.start()) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
if (start > boundary.start() && end > boundary.end()) {
|
if (start > textRange.start() && end > textRange.end()) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -11,7 +11,7 @@ import java.util.Map;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
||||||
@ -32,7 +32,7 @@ public class RedactionEntity {
|
|||||||
|
|
||||||
// initial values
|
// initial values
|
||||||
@EqualsAndHashCode.Include
|
@EqualsAndHashCode.Include
|
||||||
final Boundary boundary;
|
final TextRange textRange;
|
||||||
@EqualsAndHashCode.Include
|
@EqualsAndHashCode.Include
|
||||||
final String type;
|
final String type;
|
||||||
@EqualsAndHashCode.Include
|
@EqualsAndHashCode.Include
|
||||||
@ -66,9 +66,9 @@ public class RedactionEntity {
|
|||||||
SemanticNode deepestFullyContainingNode;
|
SemanticNode deepestFullyContainingNode;
|
||||||
|
|
||||||
|
|
||||||
public static RedactionEntity initialEntityNode(Boundary boundary, String type, EntityType entityType) {
|
public static RedactionEntity initialEntityNode(TextRange textRange, String type, EntityType entityType) {
|
||||||
|
|
||||||
return RedactionEntity.builder().type(type).entityType(entityType).boundary(boundary).engines(new HashSet<>()).references(new HashSet<>()).build();
|
return RedactionEntity.builder().type(type).entityType(entityType).textRange(textRange).engines(new HashSet<>()).references(new HashSet<>()).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -132,7 +132,7 @@ public class RedactionEntity {
|
|||||||
public List<RedactionPosition> getRedactionPositionsPerPage() {
|
public List<RedactionPosition> getRedactionPositionsPerPage() {
|
||||||
|
|
||||||
if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) {
|
if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) {
|
||||||
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(boundary);
|
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(textRange);
|
||||||
|
|
||||||
Page firstPage = rectanglesPerLinePerPage.keySet()
|
Page firstPage = rectanglesPerLinePerPage.keySet()
|
||||||
.stream()
|
.stream()
|
||||||
@ -157,19 +157,19 @@ public class RedactionEntity {
|
|||||||
|
|
||||||
public boolean containedBy(RedactionEntity redactionEntity) {
|
public boolean containedBy(RedactionEntity redactionEntity) {
|
||||||
|
|
||||||
return this.boundary.containedBy(redactionEntity.getBoundary());
|
return this.textRange.containedBy(redactionEntity.getTextRange());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean contains(RedactionEntity redactionEntity) {
|
public boolean contains(RedactionEntity redactionEntity) {
|
||||||
|
|
||||||
return this.boundary.contains(redactionEntity.getBoundary());
|
return this.textRange.contains(redactionEntity.getTextRange());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean intersects(RedactionEntity redactionEntity) {
|
public boolean intersects(RedactionEntity redactionEntity) {
|
||||||
|
|
||||||
return this.boundary.intersects(redactionEntity.getBoundary());
|
return this.textRange.intersects(redactionEntity.getTextRange());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -210,7 +210,7 @@ public class RedactionEntity {
|
|||||||
sb.append("Entity[\"");
|
sb.append("Entity[\"");
|
||||||
sb.append(value);
|
sb.append(value);
|
||||||
sb.append("\", ");
|
sb.append("\", ");
|
||||||
sb.append(boundary);
|
sb.append(textRange);
|
||||||
sb.append(", pages[");
|
sb.append(", pages[");
|
||||||
pages.forEach(page -> {
|
pages.forEach(page -> {
|
||||||
sb.append(page.getNumber());
|
sb.append(page.getNumber());
|
||||||
|
|||||||
@ -60,8 +60,8 @@ public class Document extends AbstractSemanticNode {
|
|||||||
*
|
*
|
||||||
* @return A list of main sections within the document
|
* @return A list of main sections within the document
|
||||||
* @deprecated This method is marked for removal.
|
* @deprecated This method is marked for removal.
|
||||||
* Use {@link #streamChildrenOfType(NodeType)} instead,
|
* Use {@link #streamChildrenOfType(NodeType)} instead,
|
||||||
* or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
|
* or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
|
||||||
*/
|
*/
|
||||||
@Deprecated(forRemoval = true)
|
@Deprecated(forRemoval = true)
|
||||||
public List<Section> getMainSections() {
|
public List<Section> getMainSections() {
|
||||||
|
|||||||
@ -14,7 +14,7 @@ import java.util.stream.Stream;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.EntityType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.EntityType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||||
@ -42,7 +42,9 @@ public interface SemanticNode {
|
|||||||
*/
|
*/
|
||||||
default TextBlock getTextBlock() {
|
default TextBlock getTextBlock() {
|
||||||
|
|
||||||
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector());
|
return streamAllSubNodes().filter(SemanticNode::isLeaf)
|
||||||
|
.map(SemanticNode::getTextBlock)
|
||||||
|
.collect(new TextBlockCollector());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -68,7 +70,10 @@ public interface SemanticNode {
|
|||||||
|
|
||||||
default Page getFirstPage() {
|
default Page getFirstPage() {
|
||||||
|
|
||||||
return getTextBlock().getPages().stream().min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
|
return getTextBlock().getPages()
|
||||||
|
.stream()
|
||||||
|
.min(Comparator.comparingInt(Page::getNumber))
|
||||||
|
.orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -77,18 +82,19 @@ public interface SemanticNode {
|
|||||||
*
|
*
|
||||||
* @return Set of PageNodes this node appears on.
|
* @return Set of PageNodes this node appears on.
|
||||||
*/
|
*/
|
||||||
default Set<Page> getPages(Boundary boundary) {
|
default Set<Page> getPages(TextRange textRange) {
|
||||||
|
|
||||||
if (!getBoundary().contains(boundary)) {
|
if (!getBoundary().contains(textRange)) {
|
||||||
throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", boundary, getBoundary()));
|
throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", textRange, getBoundary()));
|
||||||
}
|
}
|
||||||
return getTextBlock().getPages(boundary);
|
return getTextBlock().getPages(textRange);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
default boolean isOnPage(int pageNumber) {
|
default boolean isOnPage(int pageNumber) {
|
||||||
|
|
||||||
return getPages().stream().anyMatch(page -> page.getNumber() == pageNumber);
|
return getPages().stream()
|
||||||
|
.anyMatch(page -> page.getNumber() == pageNumber);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -203,7 +209,9 @@ public interface SemanticNode {
|
|||||||
*/
|
*/
|
||||||
default boolean hasEntitiesOfType(String type) {
|
default boolean hasEntitiesOfType(String type) {
|
||||||
|
|
||||||
return getEntities().stream().filter(entity -> entity.getEntityType().equals(EntityType.ENTITY)).anyMatch(redactionEntity -> redactionEntity.getType().equals(type));
|
return getEntities().stream()
|
||||||
|
.filter(entity -> entity.getEntityType().equals(EntityType.ENTITY))
|
||||||
|
.anyMatch(redactionEntity -> redactionEntity.getType().equals(type));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -215,7 +223,9 @@ public interface SemanticNode {
|
|||||||
*/
|
*/
|
||||||
default List<RedactionEntity> getEntitiesOfType(String type) {
|
default List<RedactionEntity> getEntitiesOfType(String type) {
|
||||||
|
|
||||||
return getEntities().stream().filter(redactionEntity -> redactionEntity.getType().equals(type)).toList();
|
return getEntities().stream()
|
||||||
|
.filter(redactionEntity -> redactionEntity.getType().equals(type))
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -227,7 +237,9 @@ public interface SemanticNode {
|
|||||||
*/
|
*/
|
||||||
default List<RedactionEntity> getEntitiesOfType(List<String> types) {
|
default List<RedactionEntity> getEntitiesOfType(List<String> types) {
|
||||||
|
|
||||||
return getEntities().stream().filter(redactionEntity -> redactionEntity.isAnyType(types)).toList();
|
return getEntities().stream()
|
||||||
|
.filter(redactionEntity -> redactionEntity.isAnyType(types))
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -241,7 +253,8 @@ public interface SemanticNode {
|
|||||||
|
|
||||||
TextBlock textBlock = getTextBlock();
|
TextBlock textBlock = getTextBlock();
|
||||||
if (!textBlock.getAtomicTextBlocks().isEmpty()) {
|
if (!textBlock.getAtomicTextBlocks().isEmpty()) {
|
||||||
return getTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage();
|
return getTextBlock().getAtomicTextBlocks()
|
||||||
|
.get(0).getNumberOnPage();
|
||||||
} else {
|
} else {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
@ -279,7 +292,8 @@ public interface SemanticNode {
|
|||||||
*/
|
*/
|
||||||
default boolean containsStrings(List<String> strings) {
|
default boolean containsStrings(List<String> strings) {
|
||||||
|
|
||||||
return strings.stream().allMatch(this::containsString);
|
return strings.stream()
|
||||||
|
.allMatch(this::containsString);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -303,7 +317,8 @@ public interface SemanticNode {
|
|||||||
*/
|
*/
|
||||||
default boolean containsAnyString(List<String> strings) {
|
default boolean containsAnyString(List<String> strings) {
|
||||||
|
|
||||||
return strings.stream().anyMatch(this::containsString);
|
return strings.stream()
|
||||||
|
.anyMatch(this::containsString);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -315,7 +330,8 @@ public interface SemanticNode {
|
|||||||
*/
|
*/
|
||||||
default boolean containsAnyStringIgnoreCase(List<String> strings) {
|
default boolean containsAnyStringIgnoreCase(List<String> strings) {
|
||||||
|
|
||||||
return strings.stream().anyMatch(this::containsStringIgnoreCase);
|
return strings.stream()
|
||||||
|
.anyMatch(this::containsStringIgnoreCase);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -328,13 +344,13 @@ public interface SemanticNode {
|
|||||||
default void addThisToEntityIfIntersects(RedactionEntity redactionEntity) {
|
default void addThisToEntityIfIntersects(RedactionEntity redactionEntity) {
|
||||||
|
|
||||||
TextBlock textBlock = getTextBlock();
|
TextBlock textBlock = getTextBlock();
|
||||||
if (textBlock.getBoundary().intersects(redactionEntity.getBoundary())) {
|
if (textBlock.getTextRange().intersects(redactionEntity.getTextRange())) {
|
||||||
if (textBlock.containsBoundary(redactionEntity.getBoundary())) {
|
if (textBlock.containsBoundary(redactionEntity.getTextRange())) {
|
||||||
redactionEntity.setDeepestFullyContainingNode(this);
|
redactionEntity.setDeepestFullyContainingNode(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
redactionEntity.addIntersectingNode(this);
|
redactionEntity.addIntersectingNode(this);
|
||||||
streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(redactionEntity.getBoundary()))
|
streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(redactionEntity.getTextRange()))
|
||||||
.forEach(node -> node.addThisToEntityIfIntersects(redactionEntity));
|
.forEach(node -> node.addThisToEntityIfIntersects(redactionEntity));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -386,7 +402,8 @@ public interface SemanticNode {
|
|||||||
*/
|
*/
|
||||||
default Stream<SemanticNode> streamAllSubNodes() {
|
default Stream<SemanticNode> streamAllSubNodes() {
|
||||||
|
|
||||||
return getDocumentTree().allSubEntriesInOrder(getTreeId()).map(DocumentTree.Entry::getNode);
|
return getDocumentTree().allSubEntriesInOrder(getTreeId())
|
||||||
|
.map(DocumentTree.Entry::getNode);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -397,7 +414,9 @@ public interface SemanticNode {
|
|||||||
*/
|
*/
|
||||||
default Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType) {
|
default Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType) {
|
||||||
|
|
||||||
return getDocumentTree().allSubEntriesInOrder(getTreeId()).filter(entry -> entry.getType().equals(nodeType)).map(DocumentTree.Entry::getNode);
|
return getDocumentTree().allSubEntriesInOrder(getTreeId())
|
||||||
|
.filter(entry -> entry.getType().equals(nodeType))
|
||||||
|
.map(DocumentTree.Entry::getNode);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -406,9 +425,9 @@ public interface SemanticNode {
|
|||||||
*
|
*
|
||||||
* @return Boundary of this Node's TextBlock
|
* @return Boundary of this Node's TextBlock
|
||||||
*/
|
*/
|
||||||
default Boundary getBoundary() {
|
default TextRange getBoundary() {
|
||||||
|
|
||||||
return getTextBlock().getBoundary();
|
return getTextBlock().getTextRange();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -454,8 +473,16 @@ public interface SemanticNode {
|
|||||||
private Map<Page, Rectangle2D> getBBoxFromChildren() {
|
private Map<Page, Rectangle2D> getBBoxFromChildren() {
|
||||||
|
|
||||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||||
List<Map<Page, Rectangle2D>> childrenBBoxes = streamChildren().map(SemanticNode::getBBox).toList();
|
|
||||||
Set<Page> pages = childrenBBoxes.stream().flatMap(map -> map.keySet().stream()).collect(Collectors.toSet());
|
List<Map<Page, Rectangle2D>> childrenBBoxes = streamChildren() //
|
||||||
|
.filter(SemanticNode::isNotOcrImage)
|
||||||
|
.map(SemanticNode::getBBox)
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
Set<Page> pages = childrenBBoxes.stream()
|
||||||
|
.flatMap(map -> map.keySet()
|
||||||
|
.stream())
|
||||||
|
.collect(Collectors.toSet());
|
||||||
for (Page page : pages) {
|
for (Page page : pages) {
|
||||||
Rectangle2D bBoxOnPage = childrenBBoxes.stream()
|
Rectangle2D bBoxOnPage = childrenBBoxes.stream()
|
||||||
.filter(childBboxPerPage -> childBboxPerPage.containsKey(page))
|
.filter(childBboxPerPage -> childBboxPerPage.containsKey(page))
|
||||||
@ -467,13 +494,24 @@ public interface SemanticNode {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean isNotOcrImage(SemanticNode node) {
|
||||||
|
|
||||||
|
if (!node.getType().equals(NodeType.IMAGE)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return The union of all BoundingBoxes of the TextBlock of this node
|
* @return The union of all BoundingBoxes of the TextBlock of this node
|
||||||
*/
|
*/
|
||||||
private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock() {
|
private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock() {
|
||||||
|
|
||||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||||
Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
|
Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks()
|
||||||
|
.stream()
|
||||||
|
.collect(Collectors.groupingBy(AtomicTextBlock::getPage));
|
||||||
atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
|
atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
|
||||||
return bBoxPerPage;
|
return bBoxPerPage;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -10,10 +10,12 @@ import java.util.Collections;
|
|||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
@ -36,14 +38,14 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
Page page;
|
Page page;
|
||||||
|
|
||||||
//string coordinates
|
//string coordinates
|
||||||
Boundary boundary;
|
TextRange textRange;
|
||||||
String searchText;
|
String searchText;
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
List<Integer> lineBreaks = new ArrayList<>();
|
List<Integer> lineBreaks = new ArrayList<>();
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
List<Boundary> boldTextBoundaries = new ArrayList<>();
|
List<TextRange> boldTextBoundaries = new ArrayList<>();
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
List<Boundary> italicTextBoundaries = new ArrayList<>();
|
List<TextRange> italicTextBoundaries = new ArrayList<>();
|
||||||
String orientation;
|
String orientation;
|
||||||
int textDirection;
|
int textDirection;
|
||||||
|
|
||||||
@ -64,10 +66,44 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String subSequenceWithLineBreaks(TextRange stringTextRange) {
|
||||||
|
|
||||||
|
if (stringTextRange.length() == 0 || !getTextRange().contains(stringTextRange)) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
Set<Integer> lbInBoundary = lineBreaks.stream()
|
||||||
|
.map(i -> i + stringTextRange.start())
|
||||||
|
.filter(stringTextRange::contains)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
if (stringTextRange.end() == getTextRange().end()) {
|
||||||
|
lbInBoundary.add(getTextRange().end());
|
||||||
|
}
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (int i = stringTextRange.start(); i < stringTextRange.end(); i++) {
|
||||||
|
char character = this.charAt(i);
|
||||||
|
if (lbInBoundary.contains(i + 1)) {
|
||||||
|
// always plus one, due to the linebreaks being an exclusive end index
|
||||||
|
if (!Character.isWhitespace(character)) {
|
||||||
|
lbInBoundary.remove(i + 1);
|
||||||
|
lbInBoundary.add(i + 2);
|
||||||
|
sb.append(character);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
sb.append("\n");
|
||||||
|
} else {
|
||||||
|
sb.append(character);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText,
|
public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText,
|
||||||
List<Integer> lineBreaks,
|
List<Integer> lineBreaks,
|
||||||
List<Boundary> boldTextBoundaries,
|
List<TextRange> boldTextBoundaries,
|
||||||
List<Boundary> italicTextBoundaries,
|
List<TextRange> italicTextBoundaries,
|
||||||
List<Rectangle2D> positions,
|
List<Rectangle2D> positions,
|
||||||
List<Integer> stringIdxToPositionIdx,
|
List<Integer> stringIdxToPositionIdx,
|
||||||
long idx,
|
long idx,
|
||||||
@ -89,7 +125,7 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
.italicTextBoundaries(italicTextBoundaries)
|
.italicTextBoundaries(italicTextBoundaries)
|
||||||
.positions(positions)
|
.positions(positions)
|
||||||
.stringIdxToPositionIdx(stringIdxToPositionIdx)
|
.stringIdxToPositionIdx(stringIdxToPositionIdx)
|
||||||
.boundary(new Boundary(offset, offset + searchText.length()))
|
.textRange(new TextRange(offset, offset + searchText.length()))
|
||||||
.textDirection(textDirection)
|
.textDirection(textDirection)
|
||||||
.orientation(orientation)
|
.orientation(orientation)
|
||||||
.build();
|
.build();
|
||||||
@ -100,7 +136,7 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
|
|
||||||
return AtomicTextBlock.builder()
|
return AtomicTextBlock.builder()
|
||||||
.id(textBlockIdx)
|
.id(textBlockIdx)
|
||||||
.boundary(new Boundary(stringOffset, stringOffset))
|
.textRange(new TextRange(stringOffset, stringOffset))
|
||||||
.searchText("")
|
.searchText("")
|
||||||
.page(page)
|
.page(page)
|
||||||
.numberOnPage(numberOnPage)
|
.numberOnPage(numberOnPage)
|
||||||
@ -109,19 +145,18 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData,
|
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, DocumentPositionData documentPositionData, SemanticNode parent, Page page) {
|
||||||
DocumentPositionData documentPositionData,
|
|
||||||
SemanticNode parent,
|
|
||||||
Page page) {
|
|
||||||
|
|
||||||
return AtomicTextBlock.builder()
|
return AtomicTextBlock.builder()
|
||||||
.id(documentTextData.getId())
|
.id(documentTextData.getId())
|
||||||
.numberOnPage(documentTextData.getNumberOnPage())
|
.numberOnPage(documentTextData.getNumberOnPage())
|
||||||
.page(page)
|
.page(page)
|
||||||
.boundary(new Boundary(documentTextData.getStart(), documentTextData.getEnd()))
|
.textRange(new TextRange(documentTextData.getStart(), documentTextData.getEnd()))
|
||||||
.searchText(documentTextData.getSearchText())
|
.searchText(documentTextData.getSearchText())
|
||||||
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList())
|
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed()
|
||||||
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList())
|
.toList())
|
||||||
|
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed()
|
||||||
|
.toList())
|
||||||
.positions(toRectangle2DList(documentPositionData.getPositions()))
|
.positions(toRectangle2DList(documentPositionData.getPositions()))
|
||||||
.parent(parent)
|
.parent(parent)
|
||||||
.build();
|
.build();
|
||||||
@ -130,7 +165,9 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
|
|
||||||
private static List<Rectangle2D> toRectangle2DList(float[][] positions) {
|
private static List<Rectangle2D> toRectangle2DList(float[][] positions) {
|
||||||
|
|
||||||
return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList();
|
return Arrays.stream(positions)
|
||||||
|
.map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3]))
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -140,11 +177,11 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
|
throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
|
||||||
}
|
}
|
||||||
if (lineNumber == 0) {
|
if (lineNumber == 0) {
|
||||||
return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start());
|
return subSequence(textRange.start(), lineBreaks.get(0) + textRange.start());
|
||||||
} else if (lineNumber == numberOfLines() - 1) {
|
} else if (lineNumber == numberOfLines() - 1) {
|
||||||
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
|
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end());
|
||||||
}
|
}
|
||||||
return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
|
return subSequence(lineBreaks.get(lineNumber - 1) + textRange.start(), lineBreaks.get(lineNumber) + textRange.start());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -159,9 +196,9 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
public int getNextLinebreak(int fromIndex) {
|
public int getNextLinebreak(int fromIndex) {
|
||||||
|
|
||||||
return lineBreaks.stream()//
|
return lineBreaks.stream()//
|
||||||
.filter(linebreak -> linebreak > fromIndex - boundary.start()) //
|
.filter(linebreak -> linebreak > fromIndex - textRange.start()) //
|
||||||
.findFirst() //
|
.findFirst() //
|
||||||
.orElse(searchText.length()) + boundary.start();
|
.orElse(searchText.length()) + textRange.start();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -169,43 +206,43 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
public int getPreviousLinebreak(int fromIndex) {
|
public int getPreviousLinebreak(int fromIndex) {
|
||||||
|
|
||||||
return lineBreaks.stream()//
|
return lineBreaks.stream()//
|
||||||
.filter(linebreak -> linebreak <= fromIndex - boundary.start())//
|
.filter(linebreak -> linebreak <= fromIndex - textRange.start())//
|
||||||
.reduce((a, b) -> b)//
|
.reduce((a, b) -> b)//
|
||||||
.orElse(0) + boundary.start();
|
.orElse(0) + textRange.start();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Rectangle2D getPosition(int stringIdx) {
|
public Rectangle2D getPosition(int stringIdx) {
|
||||||
|
|
||||||
return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start()));
|
return positions.get(stringIdxToPositionIdx.get(stringIdx - textRange.start()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
|
public List<Rectangle2D> getPositions(TextRange stringTextRange) {
|
||||||
|
|
||||||
if (!containsBoundary(stringBoundary)) {
|
if (!containsBoundary(stringTextRange)) {
|
||||||
throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringBoundary, this.boundary));
|
throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringTextRange, this.textRange));
|
||||||
}
|
}
|
||||||
if (stringBoundary.length() == 0) {
|
if (stringTextRange.length() == 0) {
|
||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
int startPositionIdx = stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start());
|
int startPositionIdx = stringIdxToPositionIdx.get(stringTextRange.start() - this.textRange.start());
|
||||||
|
|
||||||
if (stringBoundary.end() == this.boundary.end()) {
|
if (stringTextRange.end() == this.textRange.end()) {
|
||||||
return positions.subList(startPositionIdx, positions.size());
|
return positions.subList(startPositionIdx, positions.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start()));
|
return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringTextRange.end() - this.textRange.start()));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary) {
|
public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {
|
||||||
|
|
||||||
List<Rectangle2D> rectanglesPerLine = stringBoundary.split(getAllLineBreaksInBoundary(stringBoundary))
|
List<Rectangle2D> rectanglesPerLine = stringTextRange.split(getAllLineBreaksInBoundary(stringTextRange))
|
||||||
.stream()
|
.stream()
|
||||||
.map(this::getPositions)
|
.map(this::getPositions)
|
||||||
.map(RectangleTransformations::rectangleBBoxWithGaps)
|
.map(RectangleTransformations::rectangleBBoxWithGaps)
|
||||||
@ -217,9 +254,12 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<Integer> getAllLineBreaksInBoundary(Boundary boundary) {
|
private List<Integer> getAllLineBreaksInBoundary(TextRange textRange) {
|
||||||
|
|
||||||
return getLineBreaks().stream().map(linebreak -> linebreak + this.boundary.start()).filter(boundary::contains).toList();
|
return getLineBreaks().stream()
|
||||||
|
.map(linebreak -> linebreak + this.textRange.start())
|
||||||
|
.filter(textRange::contains)
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -11,7 +11,7 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
@ -24,7 +24,7 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
|
|
||||||
List<AtomicTextBlock> atomicTextBlocks;
|
List<AtomicTextBlock> atomicTextBlocks;
|
||||||
String searchText;
|
String searchText;
|
||||||
Boundary boundary;
|
TextRange textRange;
|
||||||
|
|
||||||
|
|
||||||
public static ConcatenatedTextBlock empty() {
|
public static ConcatenatedTextBlock empty() {
|
||||||
@ -37,29 +37,30 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
|
|
||||||
this.atomicTextBlocks = new LinkedList<>();
|
this.atomicTextBlocks = new LinkedList<>();
|
||||||
if (atomicTextBlocks.isEmpty()) {
|
if (atomicTextBlocks.isEmpty()) {
|
||||||
boundary = new Boundary(-1, -1);
|
textRange = new TextRange(-1, -1);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
var firstTextBlock = atomicTextBlocks.get(0);
|
var firstTextBlock = atomicTextBlocks.get(0);
|
||||||
this.atomicTextBlocks.add(firstTextBlock);
|
this.atomicTextBlocks.add(firstTextBlock);
|
||||||
boundary = new Boundary(firstTextBlock.getBoundary().start(), firstTextBlock.getBoundary().end());
|
textRange = new TextRange(firstTextBlock.getTextRange().start(), firstTextBlock.getTextRange().end());
|
||||||
|
|
||||||
atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat);
|
atomicTextBlocks.subList(1, atomicTextBlocks.size())
|
||||||
|
.forEach(this::concat);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public ConcatenatedTextBlock concat(TextBlock textBlock) {
|
public ConcatenatedTextBlock concat(TextBlock textBlock) {
|
||||||
|
|
||||||
int start = textBlock.getBoundary().start();
|
int start = textBlock.getTextRange().start();
|
||||||
int end = textBlock.getBoundary().end();
|
int end = textBlock.getTextRange().end();
|
||||||
if (this.atomicTextBlocks.isEmpty()) {
|
if (this.atomicTextBlocks.isEmpty()) {
|
||||||
boundary.setStart(start);
|
textRange.setStart(start);
|
||||||
boundary.setEnd(end);
|
textRange.setEnd(end);
|
||||||
} else if (boundary.end() != start) {
|
} else if (textRange.end() != start) {
|
||||||
throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
|
throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", textRange, textBlock.getTextRange()));
|
||||||
}
|
}
|
||||||
this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
|
this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
|
||||||
boundary.setEnd(end);
|
textRange.setEnd(end);
|
||||||
this.searchText = null;
|
this.searchText = null;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
@ -67,13 +68,18 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
|
|
||||||
private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {
|
private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {
|
||||||
|
|
||||||
return atomicTextBlocks.stream().filter(textBlock -> textBlock.getBoundary().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new);
|
return atomicTextBlocks.stream()
|
||||||
|
.filter(textBlock -> textBlock.getTextRange().contains(stringIdx))
|
||||||
|
.findAny()
|
||||||
|
.orElseThrow(IndexOutOfBoundsException::new);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(Boundary boundary) {
|
private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(TextRange textRange) {
|
||||||
|
|
||||||
return atomicTextBlocks.stream().filter(tb -> tb.getBoundary().intersects(boundary)).toList();
|
return atomicTextBlocks.stream()
|
||||||
|
.filter(tb -> tb.getTextRange().intersects(textRange))
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -92,7 +98,9 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
@Override
|
@Override
|
||||||
public int numberOfLines() {
|
public int numberOfLines() {
|
||||||
|
|
||||||
return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum();
|
return atomicTextBlocks.stream()
|
||||||
|
.map(AtomicTextBlock::getLineBreaks)
|
||||||
|
.mapToInt(List::size).sum();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -113,7 +121,10 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
@Override
|
@Override
|
||||||
public List<Integer> getLineBreaks() {
|
public List<Integer> getLineBreaks() {
|
||||||
|
|
||||||
return getAtomicTextBlocks().stream().flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks().stream()).toList();
|
return getAtomicTextBlocks().stream()
|
||||||
|
.flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks()
|
||||||
|
.stream())
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -125,47 +136,48 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
|
public List<Rectangle2D> getPositions(TextRange stringTextRange) {
|
||||||
|
|
||||||
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
|
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
|
||||||
|
|
||||||
if (textBlocks.size() == 1) {
|
if (textBlocks.size() == 1) {
|
||||||
return textBlocks.get(0).getPositions(stringBoundary);
|
return textBlocks.get(0).getPositions(stringTextRange);
|
||||||
}
|
}
|
||||||
|
|
||||||
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
||||||
List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));
|
List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end())));
|
||||||
|
|
||||||
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
||||||
positions.addAll(textBlock.getPositions());
|
positions.addAll(textBlock.getPositions());
|
||||||
}
|
}
|
||||||
|
|
||||||
var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
||||||
positions.addAll(lastTextBlock.getPositions(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));
|
positions.addAll(lastTextBlock.getPositions(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end())));
|
||||||
|
|
||||||
return positions;
|
return positions;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary) {
|
public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {
|
||||||
|
|
||||||
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
|
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
|
||||||
|
|
||||||
if (textBlocks.size() == 1) {
|
if (textBlocks.size() == 1) {
|
||||||
return textBlocks.get(0).getPositionsPerPage(stringBoundary);
|
return textBlocks.get(0).getPositionsPerPage(stringTextRange);
|
||||||
}
|
}
|
||||||
|
|
||||||
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
||||||
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end()));
|
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end()));
|
||||||
|
|
||||||
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
||||||
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getBoundary()));
|
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getTextRange()));
|
||||||
}
|
}
|
||||||
|
|
||||||
AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
||||||
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage,
|
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage,
|
||||||
lastTextBlock.getPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));
|
lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(),
|
||||||
|
stringTextRange.end())));
|
||||||
|
|
||||||
return rectanglesPerLinePerPage;
|
return rectanglesPerLinePerPage;
|
||||||
}
|
}
|
||||||
@ -174,11 +186,42 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
private Map<Page, List<Rectangle2D>> mergeEntityPositionsWithSamePageNode(Map<Page, List<Rectangle2D>> map1, Map<Page, List<Rectangle2D>> map2) {
|
private Map<Page, List<Rectangle2D>> mergeEntityPositionsWithSamePageNode(Map<Page, List<Rectangle2D>> map1, Map<Page, List<Rectangle2D>> map2) {
|
||||||
|
|
||||||
Map<Page, List<Rectangle2D>> mergedMap = new HashMap<>(map1);
|
Map<Page, List<Rectangle2D>> mergedMap = new HashMap<>(map1);
|
||||||
map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode, rectangles, (l1, l2) -> Stream.concat(l1.stream(), l2.stream()).toList()));
|
map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode,
|
||||||
|
rectangles,
|
||||||
|
(l1, l2) -> Stream.concat(l1.stream(), l2.stream())
|
||||||
|
.toList()));
|
||||||
return mergedMap;
|
return mergedMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String subSequenceWithLineBreaks(TextRange stringTextRange) {
|
||||||
|
|
||||||
|
if (stringTextRange.length() == 0 || !getTextRange().contains(stringTextRange)) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
|
||||||
|
|
||||||
|
if (textBlocks.size() == 1) {
|
||||||
|
return textBlocks.get(0).subSequenceWithLineBreaks(stringTextRange);
|
||||||
|
}
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
||||||
|
sb.append(firstTextBlock.subSequenceWithLineBreaks(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end())));
|
||||||
|
|
||||||
|
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
||||||
|
sb.append(textBlock.searchTextWithLineBreaks());
|
||||||
|
}
|
||||||
|
|
||||||
|
var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
||||||
|
sb.append(lastTextBlock.subSequenceWithLineBreaks(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end())));
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
||||||
@ -187,16 +230,22 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<Boundary> getBoldTextBoundaries() {
|
public List<TextRange> getBoldTextBoundaries() {
|
||||||
|
|
||||||
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList();
|
return getAtomicTextBlocks().stream()
|
||||||
|
.map(AtomicTextBlock::getBoldTextBoundaries)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<Boundary> getItalicTextBoundaries() {
|
public List<TextRange> getItalicTextBoundaries() {
|
||||||
|
|
||||||
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList();
|
return getAtomicTextBlocks().stream()
|
||||||
|
.map(AtomicTextBlock::getItalicTextBoundaries)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -10,7 +10,7 @@ import java.util.Map;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
|
|
||||||
public interface TextBlock extends CharSequence {
|
public interface TextBlock extends CharSequence {
|
||||||
@ -21,10 +21,10 @@ public interface TextBlock extends CharSequence {
|
|||||||
List<AtomicTextBlock> getAtomicTextBlocks();
|
List<AtomicTextBlock> getAtomicTextBlocks();
|
||||||
|
|
||||||
|
|
||||||
List<Boundary> getBoldTextBoundaries();
|
List<TextRange> getBoldTextBoundaries();
|
||||||
|
|
||||||
|
|
||||||
List<Boundary> getItalicTextBoundaries();
|
List<TextRange> getItalicTextBoundaries();
|
||||||
|
|
||||||
|
|
||||||
String getOrientation();
|
String getOrientation();
|
||||||
@ -33,7 +33,7 @@ public interface TextBlock extends CharSequence {
|
|||||||
int getTextDirection();
|
int getTextDirection();
|
||||||
|
|
||||||
|
|
||||||
Boundary getBoundary();
|
TextRange getTextRange();
|
||||||
|
|
||||||
|
|
||||||
int getNextLinebreak(int fromIndex);
|
int getNextLinebreak(int fromIndex);
|
||||||
@ -48,31 +48,41 @@ public interface TextBlock extends CharSequence {
|
|||||||
Rectangle2D getPosition(int stringIdx);
|
Rectangle2D getPosition(int stringIdx);
|
||||||
|
|
||||||
|
|
||||||
List<Rectangle2D> getPositions(Boundary stringBoundary);
|
List<Rectangle2D> getPositions(TextRange stringTextRange);
|
||||||
|
|
||||||
|
|
||||||
Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary);
|
Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange);
|
||||||
|
|
||||||
|
|
||||||
int numberOfLines();
|
int numberOfLines();
|
||||||
|
|
||||||
|
|
||||||
|
String subSequenceWithLineBreaks(TextRange stringTextRange);
|
||||||
|
|
||||||
|
|
||||||
|
default String searchTextWithLineBreaks() {
|
||||||
|
|
||||||
|
return subSequenceWithLineBreaks(getTextRange());
|
||||||
|
}
|
||||||
|
|
||||||
default int indexOf(String searchTerm) {
|
default int indexOf(String searchTerm) {
|
||||||
|
|
||||||
return indexOf(searchTerm, getBoundary().start());
|
return indexOf(searchTerm, getTextRange().start());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
default Set<Page> getPages() {
|
default Set<Page> getPages() {
|
||||||
|
|
||||||
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getPage).collect(Collectors.toUnmodifiableSet());
|
return getAtomicTextBlocks().stream()
|
||||||
|
.map(AtomicTextBlock::getPage)
|
||||||
|
.collect(Collectors.toUnmodifiableSet());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
default Set<Page> getPages(Boundary boundary) {
|
default Set<Page> getPages(TextRange textRange) {
|
||||||
|
|
||||||
return getAtomicTextBlocks().stream()
|
return getAtomicTextBlocks().stream()
|
||||||
.filter(atomicTextBlock -> atomicTextBlock.getBoundary().intersects(boundary))
|
.filter(atomicTextBlock -> atomicTextBlock.getTextRange().intersects(textRange))
|
||||||
.map(AtomicTextBlock::getPage)
|
.map(AtomicTextBlock::getPage)
|
||||||
.collect(Collectors.toUnmodifiableSet());
|
.collect(Collectors.toUnmodifiableSet());
|
||||||
}
|
}
|
||||||
@ -80,38 +90,38 @@ public interface TextBlock extends CharSequence {
|
|||||||
|
|
||||||
default int indexOf(String searchTerm, int startOffset) {
|
default int indexOf(String searchTerm, int startOffset) {
|
||||||
|
|
||||||
int start = getSearchText().indexOf(searchTerm, startOffset - getBoundary().start());
|
int start = getSearchText().indexOf(searchTerm, startOffset - getTextRange().start());
|
||||||
if (start == -1) {
|
if (start == -1) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
return start + getBoundary().start();
|
return start + getTextRange().start();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
default CharSequence getFirstLine() {
|
default CharSequence getFirstLine() {
|
||||||
|
|
||||||
return subSequence(getBoundary().start(), getNextLinebreak(getBoundary().start()));
|
return subSequence(getTextRange().start(), getNextLinebreak(getTextRange().start()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
default boolean containsBoundary(Boundary boundary) {
|
default boolean containsBoundary(TextRange textRange) {
|
||||||
|
|
||||||
if (boundary.end() < boundary.start()) {
|
if (textRange.end() < textRange.start()) {
|
||||||
throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", boundary));
|
throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", textRange));
|
||||||
}
|
}
|
||||||
return getBoundary().contains(boundary);
|
return getTextRange().contains(textRange);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
default boolean containsIndex(int stringIndex) {
|
default boolean containsIndex(int stringIndex) {
|
||||||
|
|
||||||
return getBoundary().contains(stringIndex);
|
return getTextRange().contains(stringIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
default CharSequence subSequence(Boundary boundary) {
|
default CharSequence subSequence(TextRange textRange) {
|
||||||
|
|
||||||
return subSequence(boundary.start(), boundary.end());
|
return subSequence(textRange.start(), textRange.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -128,21 +138,21 @@ public interface TextBlock extends CharSequence {
|
|||||||
@Override
|
@Override
|
||||||
default CharSequence subSequence(int start, int end) {
|
default CharSequence subSequence(int start, int end) {
|
||||||
|
|
||||||
return getSearchText().substring(start - getBoundary().start(), end - getBoundary().start());
|
return getSearchText().substring(start - getTextRange().start(), end - getTextRange().start());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
default int length() {
|
default int length() {
|
||||||
|
|
||||||
return getBoundary().length();
|
return getTextRange().length();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
default char charAt(int index) {
|
default char charAt(int index) {
|
||||||
|
|
||||||
return getSearchText().charAt(index - getBoundary().start());
|
return getSearchText().charAt(index - getTextRange().start());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import java.awt.geom.Rectangle2D;
|
|||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
@ -19,8 +19,8 @@ public class SearchTextWithTextPositionDto {
|
|||||||
String searchText;
|
String searchText;
|
||||||
List<Integer> lineBreaks;
|
List<Integer> lineBreaks;
|
||||||
List<Integer> stringIdxToPositionIdx;
|
List<Integer> stringIdxToPositionIdx;
|
||||||
List<Boundary> boldTextBoundaries;
|
List<TextRange> boldTextBoundaries;
|
||||||
List<Boundary> italicTextBoundaries;
|
List<TextRange> italicTextBoundaries;
|
||||||
List<Rectangle2D> positions;
|
List<Rectangle2D> positions;
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -9,7 +9,7 @@ import java.util.List;
|
|||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
@ -118,23 +118,23 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static List<Boundary> mergeToBoundaries(List<Integer> integers) {
|
private static List<TextRange> mergeToBoundaries(List<Integer> integers) {
|
||||||
|
|
||||||
if (integers.isEmpty()) {
|
if (integers.isEmpty()) {
|
||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
List<Boundary> boundaries = new LinkedList<>();
|
List<TextRange> boundaries = new LinkedList<>();
|
||||||
int start = integers.get(0);
|
int start = integers.get(0);
|
||||||
int end = integers.get(0) + 1;
|
int end = integers.get(0) + 1;
|
||||||
for (int current : integers) {
|
for (int current : integers) {
|
||||||
if (current > end + 1) {
|
if (current > end + 1) {
|
||||||
boundaries.add(new Boundary(start, end));
|
boundaries.add(new TextRange(start, end));
|
||||||
start = current;
|
start = current;
|
||||||
}
|
}
|
||||||
end = current + 1;
|
end = current + 1;
|
||||||
}
|
}
|
||||||
if (boundaries.isEmpty()) {
|
if (boundaries.isEmpty()) {
|
||||||
boundaries.add(new Boundary(start, end));
|
boundaries.add(new TextRange(start, end));
|
||||||
}
|
}
|
||||||
return boundaries;
|
return boundaries;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -116,8 +116,8 @@ public class DocumentDataMapper {
|
|||||||
.page(atomicTextBlock.getPage().getNumber().longValue())
|
.page(atomicTextBlock.getPage().getNumber().longValue())
|
||||||
.searchText(atomicTextBlock.getSearchText())
|
.searchText(atomicTextBlock.getSearchText())
|
||||||
.numberOnPage(atomicTextBlock.getNumberOnPage())
|
.numberOnPage(atomicTextBlock.getNumberOnPage())
|
||||||
.start(atomicTextBlock.getBoundary().start())
|
.start(atomicTextBlock.getTextRange().start())
|
||||||
.end(atomicTextBlock.getBoundary().end())
|
.end(atomicTextBlock.getTextRange().end())
|
||||||
.lineBreaks(toPrimitiveIntArray(atomicTextBlock.getLineBreaks()))
|
.lineBreaks(toPrimitiveIntArray(atomicTextBlock.getLineBreaks()))
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -13,7 +13,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Researc
|
|||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
@ -82,15 +82,15 @@ public class TaasDocumentDataMapper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static Range toRange(Boundary boundary) {
|
private static Range toRange(TextRange textRange) {
|
||||||
|
|
||||||
return new Range(boundary.start(), boundary.end());
|
return new Range(textRange.start(), textRange.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static List<Range> toRange(List<Boundary> boundary) {
|
private static List<Range> toRange(List<TextRange> textRange) {
|
||||||
|
|
||||||
return boundary.stream().map(TaasDocumentDataMapper::toRange).toList();
|
return textRange.stream().map(TaasDocumentDataMapper::toRange).toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,30 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.MarkdownParsingPipeline.buildRenderer;
|
||||||
|
|
||||||
|
import org.commonmark.node.Node;
|
||||||
|
import org.commonmark.renderer.markdown.MarkdownRenderer;
|
||||||
|
|
||||||
|
import com.didalgo.gpt3.Encoding;
|
||||||
|
import com.didalgo.gpt3.GPT3Tokenizer;
|
||||||
|
import com.didalgo.gpt3.TokenCount;
|
||||||
|
|
||||||
|
public class TokenCounter {
|
||||||
|
|
||||||
|
private static final GPT3Tokenizer tokenizer = new GPT3Tokenizer(Encoding.CL100K_BASE);
|
||||||
|
|
||||||
|
|
||||||
|
public static int countTokens(Node node) {
|
||||||
|
|
||||||
|
MarkdownRenderer renderer = buildRenderer();
|
||||||
|
String markdownResult = renderer.render(node);
|
||||||
|
return countTokens(markdownResult);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static synchronized int countTokens(String text) {
|
||||||
|
|
||||||
|
return TokenCount.fromString(text, tokenizer);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -10,6 +10,7 @@ import org.springframework.amqp.core.Message;
|
|||||||
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
||||||
import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
||||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||||
|
import org.springframework.boot.actuate.logging.LogFileWebEndpoint;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
@ -18,6 +19,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
|||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.MarkdownParsingPipeline;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
@ -29,6 +31,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
public class MessageHandler {
|
public class MessageHandler {
|
||||||
|
|
||||||
private final LayoutParsingPipeline layoutParsingPipeline;
|
private final LayoutParsingPipeline layoutParsingPipeline;
|
||||||
|
private final MarkdownParsingPipeline markdownParsingPipeline;
|
||||||
private final ObjectMapper objectMapper;
|
private final ObjectMapper objectMapper;
|
||||||
private final RabbitTemplate rabbitTemplate;
|
private final RabbitTemplate rabbitTemplate;
|
||||||
private final static String X_PIPELINE_PREFIX = "X-PIPE-";
|
private final static String X_PIPELINE_PREFIX = "X-PIPE-";
|
||||||
@ -41,30 +44,30 @@ public class MessageHandler {
|
|||||||
|
|
||||||
LayoutParsingRequest layoutParsingRequest = objectMapper.readValue(message.getBody(), LayoutParsingRequest.class);
|
LayoutParsingRequest layoutParsingRequest = objectMapper.readValue(message.getBody(), LayoutParsingRequest.class);
|
||||||
|
|
||||||
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND) && layoutParsingRequest.researchDocumentStorageId() == null) {
|
|
||||||
throw new IllegalArgumentException("ResearchDocumentDataStorageId is null!");
|
|
||||||
}
|
|
||||||
log.info("Layout parsing request received {}", layoutParsingRequest.identifier());
|
log.info("Layout parsing request received {}", layoutParsingRequest.identifier());
|
||||||
if (message.getMessageProperties().isRedelivered()) {
|
if (message.getMessageProperties().isRedelivered()) {
|
||||||
throw new AmqpRejectAndDontRequeueException(String.format("Error during last layout parsing of request with identifier: %s, do not retry.",
|
throw new AmqpRejectAndDontRequeueException(String.format("Error during last layout parsing of request with identifier: %s, do not retry.",
|
||||||
layoutParsingRequest.identifier()));
|
layoutParsingRequest.identifier()));
|
||||||
|
}
|
||||||
|
LayoutParsingFinishedEvent layoutParsingFinishedEvent;
|
||||||
|
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.MARKDOWN)) {
|
||||||
|
layoutParsingFinishedEvent = markdownParsingPipeline.parseMarkdownAndSaveToStorage(layoutParsingRequest);
|
||||||
|
} else {
|
||||||
|
layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||||
}
|
}
|
||||||
LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
|
||||||
sendLayoutParsingFinishedEvent(layoutParsingFinishedEvent, message);
|
sendLayoutParsingFinishedEvent(layoutParsingFinishedEvent, message);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void sendLayoutParsingFinishedEvent(LayoutParsingFinishedEvent layoutParsingFinishedEvent, Message message) {
|
public void sendLayoutParsingFinishedEvent(LayoutParsingFinishedEvent layoutParsingFinishedEvent, Message message) {
|
||||||
|
|
||||||
Arrays.stream(layoutParsingFinishedEvent.message().split("\n")).forEach(log::info);
|
Arrays.stream(layoutParsingFinishedEvent.message().split("\n"))
|
||||||
|
.forEach(log::info);
|
||||||
rabbitTemplate.convertAndSend(LayoutParsingQueueNames.LAYOUT_PARSING_FINISHED_EVENT_QUEUE, layoutParsingFinishedEvent, m -> {
|
rabbitTemplate.convertAndSend(LayoutParsingQueueNames.LAYOUT_PARSING_FINISHED_EVENT_QUEUE, layoutParsingFinishedEvent, m -> {
|
||||||
var forwardHeaders = message.getMessageProperties()
|
var forwardHeaders = message.getMessageProperties().getHeaders().entrySet()
|
||||||
.getHeaders()
|
|
||||||
.entrySet()
|
|
||||||
.stream()
|
.stream()
|
||||||
.filter(e -> e.getKey().toUpperCase(Locale.ROOT).startsWith(X_PIPELINE_PREFIX))
|
.filter(e -> e.getKey().toUpperCase(Locale.ROOT).startsWith(X_PIPELINE_PREFIX))
|
||||||
.collect(Collectors.toMap(Map.Entry::getKey,
|
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
|
||||||
Map.Entry::getValue));
|
|
||||||
m.getMessageProperties().getHeaders().putAll(forwardHeaders);
|
m.getMessageProperties().getHeaders().putAll(forwardHeaders);
|
||||||
return m;
|
return m;
|
||||||
});
|
});
|
||||||
|
|||||||
@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testLayoutParserEndToEnd() {
|
public void testLayoutParserEndToEnd() {
|
||||||
|
|
||||||
String filePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
|
String filePath = "files/new/crafted document.pdf";
|
||||||
|
|
||||||
runForFile(filePath);
|
runForFile(filePath);
|
||||||
}
|
}
|
||||||
@ -44,7 +44,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testLayoutParserEndToEndWithFolder() {
|
public void testLayoutParserEndToEndWithFolder() {
|
||||||
|
|
||||||
String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files";
|
String folder = "/home/kschuettler/Dokumente/TestFiles/syn-dm-single-digit-headlines";
|
||||||
List<Path> pdfFiles = Files.walk(Path.of(folder))
|
List<Path> pdfFiles = Files.walk(Path.of(folder))
|
||||||
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||||
.sorted(Comparator.comparing(Path::getFileName))
|
.sorted(Comparator.comparing(Path::getFileName))
|
||||||
@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
|||||||
file = new File(filePath);
|
file = new File(filePath);
|
||||||
}
|
}
|
||||||
|
|
||||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
|
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.DOCUMINE_OLD, true);
|
||||||
prepareStorage(layoutParsingRequest, file);
|
prepareStorage(layoutParsingRequest, file);
|
||||||
|
|
||||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||||
@ -79,9 +79,11 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
|||||||
.forEach(log::info);
|
.forEach(log::info);
|
||||||
|
|
||||||
File tmpFile = new File("/tmp/layoutparserEND2END/" + fileName + "_VIEWER.pdf");
|
File tmpFile = new File("/tmp/layoutparserEND2END/" + fileName + "_VIEWER.pdf");
|
||||||
|
File markdownTmpFile = new File("/tmp/layoutparserEND2END/" + fileName + ".md");
|
||||||
assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs();
|
assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs();
|
||||||
|
|
||||||
storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile);
|
storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile);
|
||||||
|
storageService.downloadTo(TENANT_ID, layoutParsingRequest.markdownDocumentStorageId(), markdownTmpFile);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,54 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.server;
|
||||||
|
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||||
|
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.MarkdownParsingPipeline;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class MarkdownParsingPipelineTest {
|
||||||
|
|
||||||
|
static String TENANT = "tenant";
|
||||||
|
ObjectMapper mapper = ObjectMapperFactory.create();
|
||||||
|
FileSystemBackedStorageService storageService = new FileSystemBackedStorageService(mapper);
|
||||||
|
MarkdownParsingPipeline markdownParsingPipeline = new MarkdownParsingPipeline(storageService);
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void parseMarkdownsFromFolder() {
|
||||||
|
|
||||||
|
Path file = Path.of("/home/kschuettler/Dokumente/TestFiles/confluence_dump/");
|
||||||
|
Files.walk(file)
|
||||||
|
.filter(path -> path.getFileName().toFile().toString().endsWith(".md"))
|
||||||
|
.peek(System.out::println)
|
||||||
|
.forEach(this::parseMarkdown);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public void parseMarkdown(Path file) {
|
||||||
|
|
||||||
|
LayoutParsingRequest layoutParsingRequest = AbstractTest.buildDefaultLayoutParsingRequest(file.getFileName().toFile().toString(), LayoutParsingType.MARKDOWN, true);
|
||||||
|
|
||||||
|
try (var in = new FileInputStream(file.toFile())) {
|
||||||
|
storageService.storeObject(TENANT, layoutParsingRequest.originFileStorageId(), in);
|
||||||
|
}
|
||||||
|
|
||||||
|
markdownParsingPipeline.parseMarkdownAndSaveToStorage(layoutParsingRequest);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,71 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
|
||||||
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
|
||||||
|
|
||||||
class BoundaryTest {
|
|
||||||
|
|
||||||
Boundary startBoundary;
|
|
||||||
|
|
||||||
|
|
||||||
@BeforeEach
|
|
||||||
void setUp() {
|
|
||||||
|
|
||||||
startBoundary = new Boundary(10, 100);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testContains() {
|
|
||||||
|
|
||||||
assertTrue(startBoundary.contains(11));
|
|
||||||
assertTrue(startBoundary.contains(50));
|
|
||||||
assertFalse(startBoundary.contains(9));
|
|
||||||
assertFalse(startBoundary.contains(100));
|
|
||||||
assertFalse(startBoundary.contains(150));
|
|
||||||
assertFalse(startBoundary.contains(-123));
|
|
||||||
assertTrue(startBoundary.contains(new Boundary(11, 99)));
|
|
||||||
assertTrue(startBoundary.contains(new Boundary(10, 100)));
|
|
||||||
assertTrue(startBoundary.contains(new Boundary(11, 11)));
|
|
||||||
assertFalse(startBoundary.contains(9, 100));
|
|
||||||
assertTrue(startBoundary.contains(100, 100));
|
|
||||||
assertFalse(startBoundary.contains(100, 101));
|
|
||||||
assertFalse(startBoundary.contains(150, 151));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testIntersects() {
|
|
||||||
|
|
||||||
assertTrue(startBoundary.intersects(new Boundary(1, 11)));
|
|
||||||
assertTrue(startBoundary.intersects(new Boundary(11, 12)));
|
|
||||||
assertTrue(startBoundary.intersects(new Boundary(11, 100)));
|
|
||||||
assertFalse(startBoundary.intersects(new Boundary(100, 101)));
|
|
||||||
assertTrue(startBoundary.intersects(new Boundary(99, 101)));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testSplit() {
|
|
||||||
|
|
||||||
assertEquals(4, startBoundary.split(List.of(12, 40, 90)).size());
|
|
||||||
assertEquals(List.of(new Boundary(10, 12), new Boundary(12, 40), new Boundary(40, 90), new Boundary(90, 100)), startBoundary.split(List.of(12, 40, 90)));
|
|
||||||
assertEquals(List.of(new Boundary(10, 40), new Boundary(40, 100)), startBoundary.split(List.of(40)));
|
|
||||||
assertEquals(1, startBoundary.split(Collections.emptyList()).size());
|
|
||||||
assertEquals(1, startBoundary.split(List.of(startBoundary.start())).size());
|
|
||||||
assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(Collections.singletonList(0)));
|
|
||||||
assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(Collections.singletonList(100)));
|
|
||||||
assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(List.of(12, 40, 100)));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -0,0 +1,71 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||||
|
|
||||||
|
class TextRangeTest {
|
||||||
|
|
||||||
|
TextRange startTextRange;
|
||||||
|
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() {
|
||||||
|
|
||||||
|
startTextRange = new TextRange(10, 100);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testContains() {
|
||||||
|
|
||||||
|
assertTrue(startTextRange.contains(11));
|
||||||
|
assertTrue(startTextRange.contains(50));
|
||||||
|
assertFalse(startTextRange.contains(9));
|
||||||
|
assertFalse(startTextRange.contains(100));
|
||||||
|
assertFalse(startTextRange.contains(150));
|
||||||
|
assertFalse(startTextRange.contains(-123));
|
||||||
|
assertTrue(startTextRange.contains(new TextRange(11, 99)));
|
||||||
|
assertTrue(startTextRange.contains(new TextRange(10, 100)));
|
||||||
|
assertTrue(startTextRange.contains(new TextRange(11, 11)));
|
||||||
|
assertFalse(startTextRange.contains(9, 100));
|
||||||
|
assertTrue(startTextRange.contains(100, 100));
|
||||||
|
assertFalse(startTextRange.contains(100, 101));
|
||||||
|
assertFalse(startTextRange.contains(150, 151));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testIntersects() {
|
||||||
|
|
||||||
|
assertTrue(startTextRange.intersects(new TextRange(1, 11)));
|
||||||
|
assertTrue(startTextRange.intersects(new TextRange(11, 12)));
|
||||||
|
assertTrue(startTextRange.intersects(new TextRange(11, 100)));
|
||||||
|
assertFalse(startTextRange.intersects(new TextRange(100, 101)));
|
||||||
|
assertTrue(startTextRange.intersects(new TextRange(99, 101)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testSplit() {
|
||||||
|
|
||||||
|
assertEquals(4, startTextRange.split(List.of(12, 40, 90)).size());
|
||||||
|
assertEquals(List.of(new TextRange(10, 12), new TextRange(12, 40), new TextRange(40, 90), new TextRange(90, 100)), startTextRange.split(List.of(12, 40, 90)));
|
||||||
|
assertEquals(List.of(new TextRange(10, 40), new TextRange(40, 100)), startTextRange.split(List.of(40)));
|
||||||
|
assertEquals(1, startTextRange.split(Collections.emptyList()).size());
|
||||||
|
assertEquals(1, startTextRange.split(List.of(startTextRange.start())).size());
|
||||||
|
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(0)));
|
||||||
|
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(100)));
|
||||||
|
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(List.of(12, 40, 100)));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -41,6 +41,7 @@ import lombok.SneakyThrows;
|
|||||||
@Import(AbstractTest.TestConfiguration.class)
|
@Import(AbstractTest.TestConfiguration.class)
|
||||||
public abstract class AbstractTest {
|
public abstract class AbstractTest {
|
||||||
|
|
||||||
|
public static final String MARKDOWN_FILE_ID = "markdown";
|
||||||
@Autowired
|
@Autowired
|
||||||
protected LayoutParsingStorageService layoutParsingStorageService;
|
protected LayoutParsingStorageService layoutParsingStorageService;
|
||||||
|
|
||||||
@ -105,7 +106,7 @@ public abstract class AbstractTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
|
public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
|
||||||
|
|
||||||
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
|
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
|
||||||
return LayoutParsingRequest.builder()
|
return LayoutParsingRequest.builder()
|
||||||
@ -121,6 +122,7 @@ public abstract class AbstractTest {
|
|||||||
.pageFileStorageId(fileName + PAGES_FILE_ID)
|
.pageFileStorageId(fileName + PAGES_FILE_ID)
|
||||||
.simplifiedTextStorageId(fileName + SIMPLIFIED_ID)
|
.simplifiedTextStorageId(fileName + SIMPLIFIED_ID)
|
||||||
.viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID)
|
.viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID)
|
||||||
|
.markdownDocumentStorageId(fileName + MARKDOWN_FILE_ID)
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
@ -1,5 +1,9 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
dir=${PWD##*/}
|
dir=${PWD##*/}
|
||||||
|
|
||||||
gradle assemble
|
gradle assemble
|
||||||
|
|
||||||
# Get the current Git branch
|
# Get the current Git branch
|
||||||
@ -11,5 +15,32 @@ commit_hash=$(git rev-parse --short=5 HEAD)
|
|||||||
# Combine branch and commit hash
|
# Combine branch and commit hash
|
||||||
buildName="${USER}-${branch}-${commit_hash}"
|
buildName="${USER}-${branch}-${commit_hash}"
|
||||||
|
|
||||||
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
|
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${buildName}
|
||||||
echo "nexus.knecon.com:5001/ff/layoutparser-service-server:$buildName"
|
|
||||||
|
newImageName="nexus.knecon.com:5001/ff/layoutparser-service-server:${buildName}"
|
||||||
|
|
||||||
|
echo "full image name:"
|
||||||
|
echo ${newImageName}
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
if [ -z "$1" ]; then
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
namespace=${1}
|
||||||
|
deployment_name="layoutparser-service"
|
||||||
|
|
||||||
|
echo "deploying to ${namespace}"
|
||||||
|
|
||||||
|
oldImageName=$(rancher kubectl -n ${namespace} get deployment ${deployment_name} -o=jsonpath='{.spec.template.spec.containers[*].image}')
|
||||||
|
|
||||||
|
if [ "${newImageName}" = "${oldImageName}" ]; then
|
||||||
|
echo "Image tag of ${deployment_name} did not change, redeploying..."
|
||||||
|
rancher kubectl rollout restart deployment ${deployment_name} -n ${namespace}
|
||||||
|
else
|
||||||
|
echo "upgrading the image tag of ${deployment_name}..."
|
||||||
|
rancher kubectl set image deployment/${deployment_name} ${deployment_name}=${newImageName} -n ${namespace}
|
||||||
|
fi
|
||||||
|
|
||||||
|
rancher kubectl rollout status deployment ${deployment_name} -n ${namespace}
|
||||||
|
echo "Deployed ${deployment_name}:${buildName} to ${namespace}"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user