diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/clarifynd/IndexData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/clarifynd/IndexData.java new file mode 100644 index 0000000..8040a4a --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/clarifynd/IndexData.java @@ -0,0 +1,11 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.data.clarifynd; + +import java.util.List; +import java.util.Map; + +public class IndexData { + + Map identifier; + List textChunks; + +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/clarifynd/TextChunk.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/clarifynd/TextChunk.java new file mode 100644 index 0000000..e5773df --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/clarifynd/TextChunk.java @@ -0,0 +1,18 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.data.clarifynd; + +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Range; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.experimental.FieldDefaults; + +@Builder +@AllArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class TextChunk { + + String text; +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java 
b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java index f914d29..fe437ee 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java @@ -8,5 +8,6 @@ public enum LayoutParsingType { DOCUMINE, DOCUMINE_OLD, CLARIFYND, - CLARIFYND_PARAGRAPH_DEBUG + CLARIFYND_PARAGRAPH_DEBUG, + MARKDOWN } diff --git a/layoutparser-service/layoutparser-service-processor/build.gradle.kts b/layoutparser-service/layoutparser-service-processor/build.gradle.kts index ed30bd3..b640658 100644 --- a/layoutparser-service/layoutparser-service-processor/build.gradle.kts +++ b/layoutparser-service/layoutparser-service-processor/build.gradle.kts @@ -26,4 +26,10 @@ dependencies { implementation("org.springframework.boot:spring-boot-starter-web:3.1.3") implementation("org.jgrapht:jgrapht-core:1.5.2") implementation("org.tinspin:tinspin-indexes:2.1.3") + implementation("org.commonmark:commonmark:0.22.0") + implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0") + implementation("com.didalgo:gpt3-tokenizer:0.1.8") + + implementation("org.mapstruct:mapstruct:1.5.5.Final") + annotationProcessor("org.mapstruct:mapstruct-processor:1.5.5.Final") } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 92b03b9..92d15ba 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -155,7 +155,12 @@ public class LayoutParsingPipeline { log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); - layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent()); + layoutGridService.addLayoutGrid(viewerDocumentFile, + documentGraph, + viewerDocumentFile, + false, + layoutParsingRequest.visualLayoutParsingFileId() + .isPresent()); log.info("Storing resulting files for {}", layoutParsingRequest.identifier()); @@ -163,7 +168,7 @@ public class LayoutParsingPipeline { layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile); - if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND)) { + if (layoutParsingRequest.researchDocumentStorageId() != null) { log.info("Building research document data for {}", layoutParsingRequest.identifier()); var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph); layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData); @@ -254,7 +259,7 @@ public class LayoutParsingPipeline { OutlineObject lastProcessedOutlineObject = null; // parsing the structure elements could be useful as well - if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) { + if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) { classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument)); } @@ -302,13 +307,9 @@ public class LayoutParsingPipeline { 
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings); - List graphics = graphicExtractorService.extractPathElementGraphics(originDocument, - pdPage, - pageNumber, - cleanRulings, - stripper.getTextPositionSequences(), + List graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), - false); + false); pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()) .addAll(graphics.stream() @@ -319,16 +320,11 @@ public class LayoutParsingPipeline { case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations()); case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); - case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> docstrumBlockificationService.blockify(words, - cleanRulings, - true, - classificationDocument.getVisualizations(), - layoutParsingType); - case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words, - cleanRulings, - false, - classificationDocument.getVisualizations(), - layoutParsingType); + case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> + docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType); + case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> + docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType); + default -> throw new IllegalArgumentException("Unexpected LayoutParsingType: " + layoutParsingType); }; classificationPage.setCleanRulings(cleanRulings); @@ -338,7 +334,7 @@ public class LayoutParsingPipeline { classificationPage.setPageWidth(cropbox.getWidth()); classificationPage.setPageHeight(cropbox.getHeight()); - 
if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) { + if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) { List outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>()); OutlineObject notFoundOutlineObject = null; @@ -387,8 +383,8 @@ public class LayoutParsingPipeline { } log.info("Classify TextBlocks for {}", identifier); switch (layoutParsingType) { - case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> redactManagerClassificationService.classifyDocument( - classificationDocument); + case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> + redactManagerClassificationService.classifyDocument(classificationDocument); case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument); }
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/MarkdownParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/MarkdownParsingPipeline.java new file mode 100644 index 0000000..72dbc84 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/MarkdownParsingPipeline.java @@ -0,0 +1,74 @@
package com.knecon.fforesight.service.layoutparser.processor;

import java.nio.charset.StandardCharsets;
import java.util.List;

import org.commonmark.Extension;
import org.commonmark.ext.gfm.tables.TablesExtension;
import org.commonmark.node.Document;
import org.commonmark.node.Node;
import org.commonmark.parser.Parser;
import org.commonmark.renderer.Renderer;
import org.commonmark.renderer.markdown.MarkdownRenderer;
import org.springframework.stereotype.Service;

import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.processor.markdown.MarkdownChunker;
import com.knecon.fforesight.tenantcommons.TenantContext;

import io.micrometer.observation.annotation.Observed;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;

/**
 * Reads a markdown origin file from storage, parses it with commonmark (GFM tables enabled)
 * and splits it into token-limited chunks.
 */
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class MarkdownParsingPipeline {

    /** Upper bound, in tokens, for a single markdown chunk. */
    private static final int DEFAULT_TOKEN_LIMIT = 600;

    StorageService storageService;


    /**
     * Loads the origin file of the request as UTF-8 markdown, parses it and chunks it.
     *
     * @param request the parsing request; its origin file storage id is read for the current tenant
     * @return a finished event carrying the request identifier, a page count of 1 and the elapsed milliseconds
     */
    @SneakyThrows
    @Observed(name = "MarkdownParsingPipeline", contextualName = "parse-markdown")
    public LayoutParsingFinishedEvent parseMarkdownAndSaveToStorage(LayoutParsingRequest request) {

        long start = System.currentTimeMillis();
        String markdown;
        try (var in = storageService.getObject(TenantContext.getTenantId(), request.originFileStorageId()).getInputStream()) {
            markdown = new String(in.readAllBytes(), StandardCharsets.UTF_8);
        }
        Parser parser = buildParser();
        Node node = parser.parse(markdown);

        MarkdownChunker chunker = new MarkdownChunker(DEFAULT_TOKEN_LIMIT);

        node.accept(chunker);

        Renderer renderer = buildRenderer();
        List<Document> markdownChunks = chunker.getResult();
        // NOTE(review): the chunks are computed but nothing is rendered or stored yet; the loop body is empty.
        for (Document markdownChunk : markdownChunks) {

        }

        return LayoutParsingFinishedEvent.builder().identifier(request.identifier()).numberOfPages(1).duration(System.currentTimeMillis() - start).build();
    }


    /** Builds a commonmark parser with the GFM tables extension. */
    public static Parser buildParser() {

        List<Extension> extensions = List.of(TablesExtension.create());
        return Parser.builder().extensions(extensions).build();
    }


    /** Builds a markdown renderer with the GFM tables extension. */
    public static MarkdownRenderer buildRenderer() {

        List<Extension> extensions = List.of(TablesExtension.create());
        return MarkdownRenderer.builder().extensions(extensions).build();
    }

}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/ExtraTokens.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/ExtraTokens.java new file mode 100644 index 0000000..a6aa03d --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/ExtraTokens.java @@ -0,0 +1,13 @@
package com.knecon.fforesight.service.layoutparser.processor.markdown;

import org.commonmark.node.IndentedCodeBlock;
import org.commonmark.node.Paragraph;

import com.knecon.fforesight.service.layoutparser.processor.utils.TokenCounter;

/**
 * Token allowances per node type.
 * NOTE(review): presumably the overhead added on top of the raw text when a node is rendered; confirm against TokenCounter.
 */
public class ExtraTokens {

    public static final int INDENTED_CODE_BLOCK = 10;
    public static final int PARAGRAPH = 10;


    private ExtraTokens() {
        // constants holder, not meant to be instantiated
    }

}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/MarkdownChunker.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/MarkdownChunker.java new file mode 100644 index 0000000..f73b265 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/MarkdownChunker.java @@ -0,0 +1,527 @@
package com.knecon.fforesight.service.layoutparser.processor.markdown;

import static com.knecon.fforesight.service.layoutparser.processor.MarkdownParsingPipeline.buildRenderer;
import static com.knecon.fforesight.service.layoutparser.processor.utils.TokenCounter.countTokens;

import java.text.BreakIterator;
import java.util.Collections;
import java.util.Deque;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;

import org.commonmark.ext.gfm.tables.TableBlock;
import org.commonmark.ext.gfm.tables.TableBody;
import org.commonmark.node.AbstractVisitor;
import org.commonmark.node.BlockQuote;
import org.commonmark.node.BulletList;
import org.commonmark.node.Code;
import org.commonmark.node.CustomBlock;
import org.commonmark.node.CustomNode;
import org.commonmark.node.Document;
import org.commonmark.node.Emphasis;
import org.commonmark.node.FencedCodeBlock;
import org.commonmark.node.HardLineBreak;
import org.commonmark.node.Heading;
import org.commonmark.node.HtmlBlock;
import org.commonmark.node.HtmlInline;
import org.commonmark.node.Image;
import org.commonmark.node.IndentedCodeBlock;
import org.commonmark.node.Link;
import org.commonmark.node.LinkReferenceDefinition;
import org.commonmark.node.ListBlock;
import org.commonmark.node.ListItem;
import org.commonmark.node.Node;
import org.commonmark.node.OrderedList;
import org.commonmark.node.Paragraph;
import org.commonmark.node.SoftLineBreak;
import org.commonmark.node.StrongEmphasis;
import org.commonmark.node.Text;
import org.commonmark.node.ThematicBreak;
import org.commonmark.renderer.Renderer;

import com.knecon.fforesight.service.layoutparser.processor.utils.TokenCounter;

import lombok.AccessLevel;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;

/**
 * Visitor that splits a parsed markdown document into chunks of at most {@code tokenLimit} tokens.
 * Every chunk starts with copies of the headings that are currently open, outermost first.
 * Visited nodes are moved out of the source document into the chunk documents.
 */
@Slf4j
@FieldDefaults(level = AccessLevel.PRIVATE)
public class MarkdownChunker extends AbstractVisitor {

    NodeCopier nodeCopier = NodeCopier.INSTANCE;

    final int tokenLimit;
    List<Document> allChunks;

    // stack of open headings, innermost on top
    Deque<Heading> currentHeadings;
    Document chunk;
    // true once the current chunk holds content besides the heading copies
    boolean validChunk;


    /**
     * @param tokenLimit maximum number of tokens a single chunk may contain
     */
    public MarkdownChunker(int tokenLimit) {

        this.tokenLimit = tokenLimit;
        allChunks = new LinkedList<>();
        currentHeadings = new LinkedList<>();
        startNewChunk();
    }


    /**
     * Returns all chunks built so far.
     *
     * @return the chunk documents in document order
     * @throws IllegalArgumentException if any chunk still exceeds the token limit
     */
    public List<Document> getResult() {

        for (Document candidate : allChunks) {
            if (countTokens(candidate) > tokenLimit) {
                throwUnsplittableNodeError(candidate);
            }
        }
        return allChunks;
    }


    /**
     * Headings up to level 4 open a new chunk and become part of the heading context.
     * Deeper headings are treated as ordinary content of the current chunk.
     */
    @Override
    public void visit(Heading heading) {

        if (heading.getLevel() > 4) {
            addToChunk(heading);
            // a deep heading is content only; it must not also act as a chunk boundary
            return;
        }
        // close every open heading of the same or a deeper level
        while (!currentHeadings.isEmpty() && currentHeadings.peek().getLevel() >= heading.getLevel()) {
            currentHeadings.pop();
        }
        currentHeadings.push(heading);

        startNewChunk();
    }


    /** Starts a fresh chunk; a current chunk that holds nothing but heading copies is discarded first. */
    private void startNewChunk() {

        if (!validChunk && !allChunks.isEmpty()) {
            allChunks.remove(allChunks.size() - 1);
        }
        validChunk = false;
        chunk = buildNewChunk();
        allChunks.add(chunk);
    }


    /** Creates a document pre-filled with copies of the open headings, outermost first. */
    private Document buildNewChunk() {

        Document document = new Document();
        List<Node> headingCopies = currentHeadings.stream()
                .map(nodeCopier::copyNodeWithChildren)
                .collect(Collectors.toList());
        // the deque iterates innermost-first
        Collections.reverse(headingCopies);
        headingCopies.forEach(document::appendChild);
        return document;
    }


    /** @return the token count of the chunk currently being filled */
    public int currentTokenCount() {

        return TokenCounter.countTokens(chunk);
    }


    /**
     * Checks whether a copy of the node, placed alone under the current heading context, stays within the limit.
     *
     * @param node the node to test; it is copied, not moved
     */
    public boolean fitsTokenLimit(Node node) {

        Document document = buildNewChunk();
        document.appendChild(nodeCopier.copyNodeWithChildren(node));
        return TokenCounter.countTokens(document) <= tokenLimit;
    }


    /**
     * Moves the node into the current chunk. If that overflows the limit the node goes into a new chunk,
     * and if it does not fit there either it is split.
     */
    private void addToChunk(Node node) {

        chunk.appendChild(node);

        if (currentTokenCount() <= tokenLimit) {
            // the chunk now carries real content and must survive the next startNewChunk()
            validChunk = true;
            return;
        }

        node.unlink();
        startNewChunk();
        chunk.appendChild(node);

        if (currentTokenCount() > tokenLimit) { // node is too large and won't fit in tokenLimit, split is necessary
            node.unlink();
            startNewChunk();
            splitNodeAndAddToChunk(node);
            return;
        }

        validChunk = true;

    }

private void splitNodeAndAddToChunk(Node node) { + + if (node instanceof TableBlock tableBlock) { + splitTable(tableBlock); + return; + } else if (node instanceof BulletList bulletList) { + splitList(bulletList); + return; + } else if (node instanceof OrderedList orderedList) { + splitList(orderedList); + return; + } else if (node instanceof Paragraph paragraph) { + splitParagraph(paragraph); + return; + } else if (node instanceof IndentedCodeBlock indentedCodeBlock) { + splitCodeBlock(indentedCodeBlock); + return; + } + throwUnsplittableNodeError(node); + } + + + private void splitCodeBlock(IndentedCodeBlock indentedCodeBlock) { + + List splitBlocks = new LinkedList<>(); + StringBuilder sb = new StringBuilder(); + BreakIterator lineIterator = BreakIterator.getLineInstance(Locale.ENGLISH); + lineIterator.setText(indentedCodeBlock.getLiteral()); + int start = lineIterator.first(); + for (int end = lineIterator.next(); end != BreakIterator.DONE; start = end, end = lineIterator.next()) { + String sentence = indentedCodeBlock.getLiteral().substring(start, end); + if (!fitsTokenLimit(buildIndentedCodeBlock(sb.toString()))) { + sb.replace(sb.length() - sentence.length(), sb.length(), ""); + IndentedCodeBlock block = buildIndentedCodeBlock(sb.toString()); + splitBlocks.add(block); + sb = new StringBuilder(); + } + sb.append(sentence); + } + if (!sb.isEmpty()) { + if (fitsTokenLimit(buildIndentedCodeBlock(sb.toString()))) { + splitBlocks.add(buildIndentedCodeBlock(sb.toString())); + } else { + int mid = sb.length() / 2; + splitBlocks.add(buildIndentedCodeBlock(sb.substring(0, mid))); + splitBlocks.add(buildIndentedCodeBlock(sb.substring(mid, sb.length()))); + } + } + + splitBlocks.forEach(this::addToChunk); + } + + + private static IndentedCodeBlock buildIndentedCodeBlock(String string) { + + IndentedCodeBlock block = new IndentedCodeBlock(); + block.setLiteral(string); + return block; + } + + + private void splitParagraph(Paragraph paragraph) { + + if 
(fitsTokenLimit(paragraph)) { + addToChunk(paragraph); + return; + } + + List children = collectAllChildNodes(paragraph); + + if (children.size() == 1) { + if (children.get(0) instanceof Text text) { + List splitTexts = splitText(text); + for (Text splitText : splitTexts) { + Paragraph paragraph1 = new Paragraph(); + paragraph1.appendChild(splitText); + addToChunk(paragraph1); + } + return; + } + throwUnsplittableNodeError(children.get(0)); + } + + Paragraph paragraph1 = new Paragraph(); + Paragraph paragraph2 = new Paragraph(); + + int mid = children.size() / 2; + children.subList(0, mid) + .forEach(paragraph1::appendChild); + children.subList(mid, children.size()) + .forEach(paragraph2::appendChild); + + splitParagraph(paragraph1); + splitParagraph(paragraph2); + } + + + private void throwUnsplittableNodeError(Node node) { + + Renderer renderer = buildRenderer(); + String renderedNode = renderer.render(node); + log.error(renderedNode); + throw new IllegalArgumentException(String.format("Node %s exceeds token limit (%d/%d) and can't be split!", node, countTokens(renderedNode), tokenLimit)); + } + + + private static List collectAllChildNodes(Node parent) { + + List children = new LinkedList<>(); + Node next; + for (Node child = parent.getFirstChild(); child != null; child = next) { + next = child.getNext(); + children.add(child); + } + return children; + } + + + private List splitText(Text text) { + + List splitTexts = new LinkedList<>(); + StringBuilder sb = new StringBuilder(); + BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.ENGLISH); + sentenceIterator.setText(text.getLiteral()); + int start = sentenceIterator.first(); + for (int end = sentenceIterator.next(); end != BreakIterator.DONE; start = end, end = sentenceIterator.next()) { + String sentence = text.getLiteral().substring(start, end); + if (!fitsTokenLimit(buildParagraphWithText(sb))) { + sb.replace(sb.length() - sentence.length(), sb.length(), ""); + splitTexts.add(new 
Text(sb.toString())); + sb = new StringBuilder(); + } + sb.append(sentence); + } + if (!sb.isEmpty()) { + if (fitsTokenLimit(buildParagraphWithText(sb))) { + splitTexts.add(new Text(sb.toString())); + } else { + int mid = sb.length() / 2; + splitTexts.add(new Text(sb.substring(0, mid))); + splitTexts.add(new Text(sb.substring(mid, sb.length()))); + } + } + return splitTexts; + } + + + private static Paragraph buildParagraphWithText(StringBuilder sb) { + + Paragraph paragraph = new Paragraph(); + paragraph.appendChild(new Text(sb.toString())); + return paragraph; + } + + + private void splitList(BulletList bulletList) { + + if (fitsTokenLimit(bulletList)) { + addToChunk(bulletList); + return; + } + + BulletList list1 = new BulletList(); + BulletList list2 = new BulletList(); + + splitLists(bulletList, list1, list2); + } + + + private void splitList(OrderedList orderedList) { + + if (fitsTokenLimit(orderedList)) { + addToChunk(orderedList); + return; + } + + OrderedList list1 = new OrderedList(); + OrderedList list2 = new OrderedList(); + + splitLists(orderedList, list1, list2); + } + + + private void splitLists(ListBlock originList, ListBlock list1, ListBlock list2) { + + List listItems = collectAllChildNodes(originList); + + if (listItems.size() == 1) { + collectAllChildNodes(listItems.get(0)).forEach(this::addToChunk); + } + + int mid = listItems.size() / 2; + listItems.subList(0, mid) + .forEach(list1::appendChild); + listItems.subList(mid, listItems.size()) + .forEach(list2::appendChild); + + splitNodeAndAddToChunk(list1); + splitNodeAndAddToChunk(list2); + } + + + private void splitTable(TableBlock tableBlock) { + + if (fitsTokenLimit(tableBlock)) { + addToChunk(tableBlock); + return; + } + + TableBlock tableBlock1 = new TableBlock(); + TableBlock tableBlock2 = new TableBlock(); + + tableBlock1.appendChild(nodeCopier.copy(tableBlock.getFirstChild())); + tableBlock2.appendChild(nodeCopier.copy(tableBlock.getFirstChild())); + + TableBody tableBody1 = new 
TableBody(); + TableBody tableBody2 = new TableBody(); + + List tableRows = collectAllChildNodes(tableBlock.getLastChild()); + + if (tableRows.isEmpty()) { + throw new IllegalArgumentException("The table headers already exceeds the token limit"); + } + if (tableRows.size() == 1) { + throw new IllegalArgumentException("A single table row already exceeds the token limit"); + } + + int mid = tableRows.size() / 2; + tableRows.subList(0, mid) + .forEach(tableBody1::appendChild); + tableRows.subList(mid, tableRows.size()) + .forEach(tableBody2::appendChild); + + splitTable(tableBlock1); + splitTable(tableBlock2); + } + + + public void visit(BlockQuote blockQuote) { + + this.addToChunk(blockQuote); + } + + + public void visit(BulletList bulletList) { + + this.addToChunk(bulletList); + } + + + public void visit(Code code) { + + this.addToChunk(code); + } + + + public void visit(Emphasis emphasis) { + + this.addToChunk(emphasis); + } + + + public void visit(FencedCodeBlock fencedCodeBlock) { + + this.addToChunk(fencedCodeBlock); + } + + + public void visit(HardLineBreak hardLineBreak) { + + this.addToChunk(hardLineBreak); + } + + + public void visit(ThematicBreak thematicBreak) { + + this.addToChunk(thematicBreak); + } + + + public void visit(HtmlInline htmlInline) { + + this.addToChunk(htmlInline); + } + + + public void visit(HtmlBlock htmlBlock) { + + this.addToChunk(htmlBlock); + } + + + public void visit(Image image) { + + this.addToChunk(image); + } + + + public void visit(IndentedCodeBlock indentedCodeBlock) { + + this.addToChunk(indentedCodeBlock); + } + + + public void visit(Link link) { + + this.addToChunk(link); + } + + + public void visit(ListItem listItem) { + + this.addToChunk(listItem); + } + + + public void visit(OrderedList orderedList) { + + this.addToChunk(orderedList); + } + + + public void visit(Paragraph paragraph) { + + this.addToChunk(paragraph); + } + + + public void visit(SoftLineBreak softLineBreak) { + + this.addToChunk(softLineBreak); + } + + + 
public void visit(StrongEmphasis strongEmphasis) { + + this.addToChunk(strongEmphasis); + } + + + public void visit(Text text) { + + this.addToChunk(text); + } + + + public void visit(LinkReferenceDefinition linkReferenceDefinition) { + + this.addToChunk(linkReferenceDefinition); + } + + + public void visit(CustomBlock customBlock) { + + this.addToChunk(customBlock); + } + + + public void visit(CustomNode customNode) { + + this.addToChunk(customNode); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/NodeCopier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/NodeCopier.java new file mode 100644 index 0000000..77877fb --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/NodeCopier.java @@ -0,0 +1,171 @@ +package com.knecon.fforesight.service.layoutparser.processor.markdown; + +import org.commonmark.ext.gfm.tables.TableBlock; +import org.commonmark.ext.gfm.tables.TableBody; +import org.commonmark.ext.gfm.tables.TableCell; +import org.commonmark.ext.gfm.tables.TableHead; +import org.commonmark.ext.gfm.tables.TableRow; +import org.commonmark.node.BlockQuote; +import org.commonmark.node.BulletList; +import org.commonmark.node.Code; +import org.commonmark.node.Document; +import org.commonmark.node.Emphasis; +import org.commonmark.node.FencedCodeBlock; +import org.commonmark.node.HardLineBreak; +import org.commonmark.node.Heading; +import org.commonmark.node.HtmlBlock; +import org.commonmark.node.HtmlInline; +import org.commonmark.node.Image; +import org.commonmark.node.IndentedCodeBlock; +import org.commonmark.node.Link; +import org.commonmark.node.LinkReferenceDefinition; +import org.commonmark.node.ListItem; +import org.commonmark.node.Node; +import org.commonmark.node.OrderedList; +import 
org.commonmark.node.Paragraph; +import org.commonmark.node.SoftLineBreak; +import org.commonmark.node.StrongEmphasis; +import org.commonmark.node.Text; +import org.commonmark.node.ThematicBreak; +import org.mapstruct.Mapper; +import org.mapstruct.Mapping; +import org.mapstruct.factory.Mappers; + +@Mapper +public interface NodeCopier { + + NodeCopier INSTANCE = Mappers.getMapper(NodeCopier.class); + + + default Node copyNodeWithChildren(Node node) { + + Node copy = copy(node); + + Node next; + for (Node child = node.getFirstChild(); child != null; child = next) { + next = child.getNext(); + copy.appendChild(copyNodeWithChildren(child)); + } + return copy; + } + + + default Node copy(Node node) { + + return switch (node.getClass().getSimpleName()) { + case "BlockQuote" -> copy((BlockQuote) node); + case "BulletList" -> copy((BulletList) node); + case "Code" -> copy((Code) node); + case "Document" -> copy((Document) node); + case "Emphasis" -> copy((Emphasis) node); + case "FencedCodeBlock" -> copy((FencedCodeBlock) node); + case "HardLineBreak" -> copy((HardLineBreak) node); + case "Heading" -> copy((Heading) node); + case "HtmlBlock" -> copy((HtmlBlock) node); + case "HtmlInline" -> copy((HtmlInline) node); + case "Image" -> copy((Image) node); + case "IndentedCodeBlock" -> copy((IndentedCodeBlock) node); + case "Link" -> copy((Link) node); + case "LinkReferenceDefinition" -> copy((LinkReferenceDefinition) node); + case "ListItem" -> copy((ListItem) node); + case "OrderedList" -> copy((OrderedList) node); + case "Paragraph" -> copy((Paragraph) node); + case "SoftLineBreak" -> copy((SoftLineBreak) node); + case "StrongEmphasis" -> copy((StrongEmphasis) node); + case "Text" -> copy((Text) node); + case "ThematicBreak" -> copy((ThematicBreak) node); + case "TableBlock" -> copy((TableBlock) node); + case "TableBody" -> copy((TableBody) node); + case "TableCell" -> copy((TableCell) node); + case "TableHead" -> copy((TableHead) node); + case "TableRow" -> copy((TableRow) 
node); + default -> throw new IllegalArgumentException("No copy method found for class: " + node.getClass().getName()); + }; + } + + + BlockQuote copy(BlockQuote blockQuote); + + + @Mapping(target = "bulletMarker", ignore = true) + BulletList copy(BulletList bulletList); + + + Code copy(Code code); + + + Document copy(Document document); + + + @Mapping(target = "delimiter", source = "openingDelimiter") + Emphasis copy(Emphasis emphasis); + + + @Mapping(target = "fenceChar", ignore = true) + @Mapping(target = "fenceLength", ignore = true) + FencedCodeBlock copy(FencedCodeBlock fencedCodeBlock); + + + HardLineBreak copy(HardLineBreak hardLineBreak); + + + Heading copy(Heading heading); + + + HtmlBlock copy(HtmlBlock htmlBlock); + + + HtmlInline copy(HtmlInline htmlInline); + + + Image copy(Image image); + + + IndentedCodeBlock copy(IndentedCodeBlock indentedCodeBlock); + + + Link copy(Link link); + + + LinkReferenceDefinition copy(LinkReferenceDefinition linkReferenceDefinition); + + + ListItem copy(ListItem listItem); + + + @Mapping(target = "startNumber", ignore = true) + @Mapping(target = "delimiter", ignore = true) + OrderedList copy(OrderedList orderedList); + + + Paragraph copy(Paragraph paragraph); + + + SoftLineBreak copy(SoftLineBreak softLineBreak); + + + @Mapping(target = "delimiter", source = "openingDelimiter") + StrongEmphasis copy(StrongEmphasis strongEmphasis); + + + Text copy(Text text); + + + ThematicBreak copy(ThematicBreak thematicBreak); + + + TableBlock copy(TableBlock tableBlock); + + + TableBody copy(TableBody tableBody); + + + TableCell copy(TableCell tableCell); + + + TableHead copy(TableHead tableHead); + + + TableRow copy(TableRow tableRow); + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/ReflectionNodeCopier.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/ReflectionNodeCopier.java new file mode 100644 index 0000000..5fac185 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/ReflectionNodeCopier.java @@ -0,0 +1,65 @@
package com.knecon.fforesight.service.layoutparser.processor.markdown;

import java.lang.reflect.Field;
import java.lang.reflect.Modifier;

import org.commonmark.node.Node;

/**
 * Copies commonmark node trees by reflection: each node is re-created through its no-arg constructor
 * and its primitive, wrapper and String fields are copied over.
 */
public class ReflectionNodeCopier {

    // NOTE(review): never assigned or read in this class; kept so that existing references keep compiling.
    NodeCopier mapperNodeCopier;


    /**
     * Copies the node together with all of its descendants. The source tree is left untouched.
     *
     * @throws IllegalStateException if a node cannot be instantiated or one of its fields cannot be accessed
     */
    public static Node copyNode(Node node) {

        Node copy = deepCopy(node);
        copyChildren(node, copy);
        return copy;
    }


    private static void copyChildren(Node nodeToCopy, Node copy) {

        for (Node child = nodeToCopy.getFirstChild(); child != null; child = child.getNext()) {
            copy.appendChild(copyNode(child));
        }
    }


    /**
     * Creates a new instance of the object's class and copies every non-static primitive, wrapper and
     * String field, including those declared on superclasses. Reference fields are left at their defaults.
     */
    @SuppressWarnings("unchecked") // the copy is created from object.getClass(), so it is a T
    private static <T> T deepCopy(T object) {

        Class<?> type = object.getClass();
        try {
            T copy = (T) type.getDeclaredConstructor().newInstance();

            // walk up the hierarchy so inherited state is copied as well
            for (Class<?> current = type; current != null && current != Object.class; current = current.getSuperclass()) {
                for (Field field : current.getDeclaredFields()) {
                    if (Modifier.isStatic(field.getModifiers())) {
                        continue;
                    }
                    if (isPrimitiveOrWrapper(field.getType()) || field.getType().equals(String.class)) {
                        field.setAccessible(true);
                        field.set(copy, field.get(object));
                    }
                }
            }
            return copy;
        } catch (ReflectiveOperationException e) {
            // fail here with the cause instead of returning null and failing later with an NPE
            throw new IllegalStateException("Cannot copy node of type " + type.getName(), e);
        }
    }


    private static boolean isPrimitiveOrWrapper(Class<?> type) {

        return type.isPrimitive()
                || type == Boolean.class
                || type == Byte.class
                || type == Character.class
                || type == Double.class
                || type == Float.class
                || type == Integer.class
                || type == Long.class
                || type == Short.class;
    }

}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java
index a3d7917..f8548e9 100755
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java
@@ -7,6 +7,8 @@ import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;
 
+import com.google.common.base.Functions;
+
 import lombok.Getter;
 
 @Getter
@@ -59,7 +61,9 @@ public class FloatFrequencyCounter {
             }
         }
 
-        return higher.stream().sorted(Collections.reverseOrder()).collect(Collectors.toList());
+        return higher.stream()
+                .sorted(Collections.reverseOrder())
+                .collect(Collectors.toList());
     }
 
 
@@ -74,4 +78,23 @@ public class FloatFrequencyCounter {
         return highest;
     }
 
+
+    /**
+     * Computes the average of all counted values, weighting every value by how often it was counted.
+     *
+     * @return the weighted average; {@code NaN} when no value has been counted
+     */
+    public double getAverage() {
+
+        double weightedSum = countPerValue.entrySet()
+                .stream()
+                .mapToDouble(entry -> entry.getKey() * entry.getValue())
+                .sum();
+        double totalCount = countPerValue.values()
+                .stream()
+                .mapToInt(Integer::intValue)
+                .sum();
+        return weightedSum / totalCount;
+    }
+
 }
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java
index f82d3fa..8630a73 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java
@@ -60,8 +60,8 @@ public class Document extends
AbstractSemanticNode { * * @return A list of main sections within the document * @deprecated This method is marked for removal. - * Use {@link #streamChildrenOfType(NodeType)} instead, - * or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION. + * Use {@link #streamChildrenOfType(NodeType)} instead, + * or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION. */ @Deprecated(forRemoval = true) public List
getMainSections() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java index 32369e6..f26289f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java @@ -42,7 +42,9 @@ public interface SemanticNode { */ default TextBlock getTextBlock() { - return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector()); + return streamAllSubNodes().filter(SemanticNode::isLeaf) + .map(SemanticNode::getTextBlock) + .collect(new TextBlockCollector()); } @@ -68,7 +70,10 @@ public interface SemanticNode { default Page getFirstPage() { - return getTextBlock().getPages().stream().min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!")); + return getTextBlock().getPages() + .stream() + .min(Comparator.comparingInt(Page::getNumber)) + .orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!")); } @@ -88,7 +93,8 @@ public interface SemanticNode { default boolean isOnPage(int pageNumber) { - return getPages().stream().anyMatch(page -> page.getNumber() == pageNumber); + return getPages().stream() + .anyMatch(page -> page.getNumber() == pageNumber); } @@ -203,7 +209,9 @@ public interface SemanticNode { */ default boolean hasEntitiesOfType(String type) { - return getEntities().stream().filter(entity -> entity.getEntityType().equals(EntityType.ENTITY)).anyMatch(redactionEntity -> redactionEntity.getType().equals(type)); + 
return getEntities().stream() + .filter(entity -> entity.getEntityType().equals(EntityType.ENTITY)) + .anyMatch(redactionEntity -> redactionEntity.getType().equals(type)); } @@ -215,7 +223,9 @@ public interface SemanticNode { */ default List getEntitiesOfType(String type) { - return getEntities().stream().filter(redactionEntity -> redactionEntity.getType().equals(type)).toList(); + return getEntities().stream() + .filter(redactionEntity -> redactionEntity.getType().equals(type)) + .toList(); } @@ -227,7 +237,9 @@ public interface SemanticNode { */ default List getEntitiesOfType(List types) { - return getEntities().stream().filter(redactionEntity -> redactionEntity.isAnyType(types)).toList(); + return getEntities().stream() + .filter(redactionEntity -> redactionEntity.isAnyType(types)) + .toList(); } @@ -241,7 +253,8 @@ public interface SemanticNode { TextBlock textBlock = getTextBlock(); if (!textBlock.getAtomicTextBlocks().isEmpty()) { - return getTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage(); + return getTextBlock().getAtomicTextBlocks() + .get(0).getNumberOnPage(); } else { return -1; } @@ -279,7 +292,8 @@ public interface SemanticNode { */ default boolean containsStrings(List strings) { - return strings.stream().allMatch(this::containsString); + return strings.stream() + .allMatch(this::containsString); } @@ -303,7 +317,8 @@ public interface SemanticNode { */ default boolean containsAnyString(List strings) { - return strings.stream().anyMatch(this::containsString); + return strings.stream() + .anyMatch(this::containsString); } @@ -315,7 +330,8 @@ public interface SemanticNode { */ default boolean containsAnyStringIgnoreCase(List strings) { - return strings.stream().anyMatch(this::containsStringIgnoreCase); + return strings.stream() + .anyMatch(this::containsStringIgnoreCase); } @@ -386,7 +402,8 @@ public interface SemanticNode { */ default Stream streamAllSubNodes() { - return 
getDocumentTree().allSubEntriesInOrder(getTreeId()).map(DocumentTree.Entry::getNode);
+        return getDocumentTree().allSubEntriesInOrder(getTreeId())
+                .map(DocumentTree.Entry::getNode);
     }
 
 
@@ -397,7 +414,9 @@ public interface SemanticNode {
      */
     default Stream streamAllSubNodesOfType(NodeType nodeType) {
 
-        return getDocumentTree().allSubEntriesInOrder(getTreeId()).filter(entry -> entry.getType().equals(nodeType)).map(DocumentTree.Entry::getNode);
+        return getDocumentTree().allSubEntriesInOrder(getTreeId())
+                .filter(entry -> entry.getType().equals(nodeType))
+                .map(DocumentTree.Entry::getNode);
     }
 
 
@@ -454,8 +473,16 @@ public interface SemanticNode {
     private Map getBBoxFromChildren() {
 
         Map bBoxPerPage = new HashMap<>();
-        List> childrenBBoxes = streamChildren().map(SemanticNode::getBBox).toList();
-        Set pages = childrenBBoxes.stream().flatMap(map -> map.keySet().stream()).collect(Collectors.toSet());
+
+        List> childrenBBoxes = streamChildren() //
+                .filter(SemanticNode::isNotOcrImage)
+                .map(SemanticNode::getBBox)
+                .toList();
+
+        Set pages = childrenBBoxes.stream()
+                .flatMap(map -> map.keySet()
+                        .stream())
+                .collect(Collectors.toSet());
         for (Page page : pages) {
             Rectangle2D bBoxOnPage = childrenBBoxes.stream()
                     .filter(childBboxPerPage -> childBboxPerPage.containsKey(page))
@@ -467,13 +494,24 @@ public interface SemanticNode {
     }
 
 
+    /**
+     * @return true unless the given node is of type {@link NodeType#IMAGE}
+     */
+    private static boolean isNotOcrImage(SemanticNode node) {
+
+        return !node.getType().equals(NodeType.IMAGE);
+    }
+
+
     /**
      * @return The union of all BoundingBoxes of the TextBlock of this node
      */
     private Map getBBoxFromLeafTextBlock() {
 
         Map bBoxPerPage = new HashMap<>();
-        Map> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
+        Map> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks()
+                .stream()
+                .collect(Collectors.groupingBy(AtomicTextBlock::getPage));
         atomicTextBlockPerPage.forEach((page, atbs) ->
bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs))); return bBoxPerPage; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java index 33d9427..8d393fd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java @@ -109,10 +109,7 @@ public class AtomicTextBlock implements TextBlock { } - public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, - DocumentPositionData documentPositionData, - SemanticNode parent, - Page page) { + public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, DocumentPositionData documentPositionData, SemanticNode parent, Page page) { return AtomicTextBlock.builder() .id(documentTextData.getId()) @@ -120,8 +117,10 @@ public class AtomicTextBlock implements TextBlock { .page(page) .boundary(new Boundary(documentTextData.getStart(), documentTextData.getEnd())) .searchText(documentTextData.getSearchText()) - .lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList()) - .stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList()) + .lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed() + .toList()) + .stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed() + .toList()) .positions(toRectangle2DList(documentPositionData.getPositions())) .parent(parent) .build(); @@ -130,7 +129,9 @@ 
public class AtomicTextBlock implements TextBlock { private static List toRectangle2DList(float[][] positions) { - return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList(); + return Arrays.stream(positions) + .map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])) + .toList(); } @@ -159,9 +160,9 @@ public class AtomicTextBlock implements TextBlock { public int getNextLinebreak(int fromIndex) { return lineBreaks.stream()// - .filter(linebreak -> linebreak > fromIndex - boundary.start()) // - .findFirst() // - .orElse(searchText.length()) + boundary.start(); + .filter(linebreak -> linebreak > fromIndex - boundary.start()) // + .findFirst() // + .orElse(searchText.length()) + boundary.start(); } @@ -169,9 +170,9 @@ public class AtomicTextBlock implements TextBlock { public int getPreviousLinebreak(int fromIndex) { return lineBreaks.stream()// - .filter(linebreak -> linebreak <= fromIndex - boundary.start())// - .reduce((a, b) -> b)// - .orElse(0) + boundary.start(); + .filter(linebreak -> linebreak <= fromIndex - boundary.start())// + .reduce((a, b) -> b)// + .orElse(0) + boundary.start(); } @@ -219,7 +220,10 @@ public class AtomicTextBlock implements TextBlock { private List getAllLineBreaksInBoundary(Boundary boundary) { - return getLineBreaks().stream().map(linebreak -> linebreak + this.boundary.start()).filter(boundary::contains).toList(); + return getLineBreaks().stream() + .map(linebreak -> linebreak + this.boundary.start()) + .filter(boundary::contains) + .toList(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TokenCounter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TokenCounter.java new file mode 100644 index 0000000..f9203c1 --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TokenCounter.java
@@ -0,0 +1,44 @@
+package com.knecon.fforesight.service.layoutparser.processor.utils;
+
+import static com.knecon.fforesight.service.layoutparser.processor.MarkdownParsingPipeline.buildRenderer;
+
+import org.commonmark.node.Node;
+import org.commonmark.renderer.markdown.MarkdownRenderer;
+
+import com.didalgo.gpt3.Encoding;
+import com.didalgo.gpt3.GPT3Tokenizer;
+import com.didalgo.gpt3.TokenCount;
+
+/**
+ * Counts tokens of markdown content using a {@link GPT3Tokenizer} with the {@code CL100K_BASE} encoding.
+ */
+public class TokenCounter {
+
+    private static final GPT3Tokenizer tokenizer = new GPT3Tokenizer(Encoding.CL100K_BASE);
+
+
+    /**
+     * Renders the node to markdown and counts the tokens of the rendered text.
+     *
+     * @param node the node to render and measure
+     * @return the number of tokens of the rendered markdown
+     */
+    public static int countTokens(Node node) {
+
+        MarkdownRenderer markdownRenderer = buildRenderer();
+        return countTokens(markdownRenderer.render(node));
+    }
+
+
+    /**
+     * Counts the tokens of the given text.
+     *
+     * @param text the text to tokenize
+     * @return the number of tokens in {@code text}
+     */
+    public static synchronized int countTokens(String text) {
+
+        return TokenCount.fromString(text, tokenizer);
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java
index e4c4d8f..b10c15e 100644
--- a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java
+++ b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java
@@ -10,6 +10,7 @@ import org.springframework.amqp.core.Message;
 import org.springframework.amqp.rabbit.annotation.RabbitHandler;
 import org.springframework.amqp.rabbit.annotation.RabbitListener;
 import org.springframework.amqp.rabbit.core.RabbitTemplate;
+import org.springframework.boot.actuate.logging.LogFileWebEndpoint;
 import org.springframework.stereotype.Service;
 
 import
com.fasterxml.jackson.databind.ObjectMapper; @@ -18,6 +19,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; +import com.knecon.fforesight.service.layoutparser.processor.MarkdownParsingPipeline; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; @@ -29,9 +31,11 @@ import lombok.extern.slf4j.Slf4j; public class MessageHandler { private final LayoutParsingPipeline layoutParsingPipeline; + private final MarkdownParsingPipeline markdownParsingPipeline; private final ObjectMapper objectMapper; private final RabbitTemplate rabbitTemplate; private final static String X_PIPELINE_PREFIX = "X-PIPE-"; + private final LogFileWebEndpoint logFileWebEndpoint; @RabbitHandler @@ -41,30 +45,30 @@ public class MessageHandler { LayoutParsingRequest layoutParsingRequest = objectMapper.readValue(message.getBody(), LayoutParsingRequest.class); - if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND) && layoutParsingRequest.researchDocumentStorageId() == null) { - throw new IllegalArgumentException("ResearchDocumentDataStorageId is null!"); - } log.info("Layout parsing request received {}", layoutParsingRequest.identifier()); if (message.getMessageProperties().isRedelivered()) { throw new AmqpRejectAndDontRequeueException(String.format("Error during last layout parsing of request with identifier: %s, do not retry.", - layoutParsingRequest.identifier())); + layoutParsingRequest.identifier())); + } + LayoutParsingFinishedEvent layoutParsingFinishedEvent; + if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.MARKDOWN)) { + layoutParsingFinishedEvent = markdownParsingPipeline.parseMarkdownAndSaveToStorage(layoutParsingRequest); + } else { + 
layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); } - LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); sendLayoutParsingFinishedEvent(layoutParsingFinishedEvent, message); } public void sendLayoutParsingFinishedEvent(LayoutParsingFinishedEvent layoutParsingFinishedEvent, Message message) { - Arrays.stream(layoutParsingFinishedEvent.message().split("\n")).forEach(log::info); + Arrays.stream(layoutParsingFinishedEvent.message().split("\n")) + .forEach(log::info); rabbitTemplate.convertAndSend(LayoutParsingQueueNames.LAYOUT_PARSING_FINISHED_EVENT_QUEUE, layoutParsingFinishedEvent, m -> { - var forwardHeaders = message.getMessageProperties() - .getHeaders() - .entrySet() + var forwardHeaders = message.getMessageProperties().getHeaders().entrySet() .stream() .filter(e -> e.getKey().toUpperCase(Locale.ROOT).startsWith(X_PIPELINE_PREFIX)) - .collect(Collectors.toMap(Map.Entry::getKey, - Map.Entry::getValue)); + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); m.getMessageProperties().getHeaders().putAll(forwardHeaders); return m; }); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index d73fc14..75e62c9 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -44,7 +44,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @SneakyThrows public void testLayoutParserEndToEndWithFolder() { - String folder = 
"/home/kschuettler/Dokumente/TestFiles/large number of prod files"; + String folder = "/home/kschuettler/Dokumente/TestFiles/syn-dm-single-digit-headlines"; List pdfFiles = Files.walk(Path.of(folder)) .filter(path -> path.getFileName().toString().endsWith(".pdf")) .sorted(Comparator.comparing(Path::getFileName)) @@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { file = new File(filePath); } - LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true); + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.DOCUMINE_OLD, true); prepareStorage(layoutParsingRequest, file); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/MarkdownParsingPipelineTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/MarkdownParsingPipelineTest.java new file mode 100644 index 0000000..da28060 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/MarkdownParsingPipelineTest.java @@ -0,0 +1,54 @@ +package com.knecon.fforesight.service.layoutparser.server; + +import java.io.FileInputStream; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.junit.jupiter.api.Test; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.commons.jackson.ObjectMapperFactory; +import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import 
com.knecon.fforesight.service.layoutparser.processor.MarkdownParsingPipeline;
+import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
+
+import lombok.AccessLevel;
+import lombok.SneakyThrows;
+import lombok.experimental.FieldDefaults;
+
+/**
+ * Manual test that runs the markdown parsing pipeline on every markdown file found in a local folder.
+ */
+@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
+public class MarkdownParsingPipelineTest {
+
+    static String TENANT = "tenant";
+    ObjectMapper mapper = ObjectMapperFactory.create();
+    FileSystemBackedStorageService storageService = new FileSystemBackedStorageService(mapper);
+    MarkdownParsingPipeline markdownParsingPipeline = new MarkdownParsingPipeline(storageService);
+
+
+    /**
+     * Walks the test folder and parses every file whose name ends with {@code .md}.
+     */
+    @Test
+    @SneakyThrows
+    public void parseMarkdownsFromFolder() {
+
+        Path folder = Path.of("/home/kschuettler/Dokumente/TestFiles/confluence_dump/");
+        // Files.walk keeps directories open until the stream is closed, hence the try-with-resources
+        try (var paths = Files.walk(folder)) {
+            paths.filter(path -> path.getFileName().toFile().toString().endsWith(".md"))
+                    .peek(System.out::println)
+                    .forEach(this::parseMarkdown);
+        }
+    }
+
+
+    /**
+     * Stores the given file under the origin file storage id of a default MARKDOWN request and runs the pipeline on it.
+     *
+     * @param file the markdown file to parse
+     */
+    @SneakyThrows
+    public void parseMarkdown(Path file) {
+
+        LayoutParsingRequest layoutParsingRequest = AbstractTest.buildDefaultLayoutParsingRequest(file.getFileName().toFile().toString(), LayoutParsingType.MARKDOWN, true);
+
+        try (var in = new FileInputStream(file.toFile())) {
+            storageService.storeObject(TENANT, layoutParsingRequest.originFileStorageId(), in);
+        }
+
+        markdownParsingPipeline.parseMarkdownAndSaveToStorage(layoutParsingRequest);
+    }
+
+}
diff --git a/publish-custom-image.sh b/publish-custom-image.sh
index e2191d7..f9fe6f2 100755
--- a/publish-custom-image.sh
+++ b/publish-custom-image.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
+
+set -e
+
 dir=${PWD##*/}
+
 gradle assemble
 
 # Get the current Git branch
@@ -11,5 +15,32 @@ commit_hash=$(git rev-parse --short=5 HEAD)
 # Combine branch and commit hash
 buildName="${USER}-${branch}-${commit_hash}"
 
-gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
-echo "nexus.knecon.com:5001/ff/layoutparser-service-server:$buildName"
+gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion="${buildName}"
+
+newImageName="nexus.knecon.com:5001/ff/layoutparser-service-server:${buildName}"
+
+echo "full image name:"
+echo "${newImageName}"
+echo ""
+
+if [ -z "$1" ]; then
+    exit 0
+fi
+
+namespace="${1}"
+deployment_name="layoutparser-service"
+
+echo "deploying to ${namespace}"
+
+oldImageName=$(rancher kubectl -n "${namespace}" get deployment "${deployment_name}" -o=jsonpath='{.spec.template.spec.containers[*].image}')
+
+if [ "${newImageName}" = "${oldImageName}" ]; then
+    echo "Image tag of ${deployment_name} did not change, redeploying..."
+    rancher kubectl rollout restart deployment "${deployment_name}" -n "${namespace}"
+else
+    echo "upgrading the image tag of ${deployment_name}..."
+    rancher kubectl set image "deployment/${deployment_name}" "${deployment_name}=${newImageName}" -n "${namespace}"
+fi
+
+rancher kubectl rollout status deployment "${deployment_name}" -n "${namespace}"
+echo "Deployed ${deployment_name}:${buildName} to ${namespace}"