CLARI-002: markdown chunking prototype
This commit is contained in:
parent b216f02e15
commit 4fb0de82ec
@@ -0,0 +1,11 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.clarifynd;

import java.util.List;
import java.util.Map;

public class IndexData {

    Map<String, String> identifier;
    List<TextChunk> textChunks;

}
@@ -0,0 +1,18 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.clarifynd;

import java.util.List;

import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Range;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.experimental.FieldDefaults;

@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class TextChunk {

    String text;
}
@@ -8,5 +8,6 @@ public enum LayoutParsingType {
    DOCUMINE,
    DOCUMINE_OLD,
    CLARIFYND,
-   CLARIFYND_PARAGRAPH_DEBUG
+   CLARIFYND_PARAGRAPH_DEBUG,
+   MARKDOWN
}

@@ -26,4 +26,10 @@ dependencies {
    implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
    implementation("org.jgrapht:jgrapht-core:1.5.2")
    implementation("org.tinspin:tinspin-indexes:2.1.3")
+   implementation("org.commonmark:commonmark:0.22.0")
+   implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
+   implementation("com.didalgo:gpt3-tokenizer:0.1.8")
+
+   implementation("org.mapstruct:mapstruct:1.5.5.Final")
+   annotationProcessor("org.mapstruct:mapstruct-processor:1.5.5.Final")
}

@@ -155,7 +155,12 @@ public class LayoutParsingPipeline {

        log.info("Creating viewer document for {}", layoutParsingRequest.identifier());

-       layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent());
+       layoutGridService.addLayoutGrid(viewerDocumentFile,
+               documentGraph,
+               viewerDocumentFile,
+               false,
+               layoutParsingRequest.visualLayoutParsingFileId()
+                       .isPresent());

        log.info("Storing resulting files for {}", layoutParsingRequest.identifier());

@@ -163,7 +168,7 @@ public class LayoutParsingPipeline {
        layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
        layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);

-       if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND)) {
+       if (layoutParsingRequest.researchDocumentStorageId() != null) {
            log.info("Building research document data for {}", layoutParsingRequest.identifier());
            var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
            layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
@@ -254,7 +259,7 @@ public class LayoutParsingPipeline {
        OutlineObject lastProcessedOutlineObject = null;

        // parsing the structure elements could be useful as well
-       if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
+       if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
            classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
        }

@@ -302,13 +307,9 @@ public class LayoutParsingPipeline {

        TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);

-       List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
-               pdPage,
-               pageNumber,
-               cleanRulings,
-               stripper.getTextPositionSequences(),
-               false);
+       List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(),
+               false);

        pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
                .addAll(graphics.stream()
@@ -319,16 +320,11 @@ public class LayoutParsingPipeline {
            case REDACT_MANAGER_OLD ->
                    redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
            case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
-           case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> docstrumBlockificationService.blockify(words,
-                   cleanRulings,
-                   true,
-                   classificationDocument.getVisualizations(),
-                   layoutParsingType);
-           case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words,
-                   cleanRulings,
-                   false,
-                   classificationDocument.getVisualizations(),
-                   layoutParsingType);
+           case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
+                   docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
+           case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
+                   docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
            default -> throw new IllegalArgumentException("Unexpected LayoutParsingType: " + layoutParsingType);
        };

        classificationPage.setCleanRulings(cleanRulings);
@@ -338,7 +334,7 @@ public class LayoutParsingPipeline {
        classificationPage.setPageWidth(cropbox.getWidth());
        classificationPage.setPageHeight(cropbox.getHeight());

-       if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
+       if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
            List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());

            OutlineObject notFoundOutlineObject = null;
@@ -387,8 +383,8 @@ public class LayoutParsingPipeline {
        }
        log.info("Classify TextBlocks for {}", identifier);
        switch (layoutParsingType) {
-           case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> redactManagerClassificationService.classifyDocument(
-                   classificationDocument);
+           case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
+                   redactManagerClassificationService.classifyDocument(classificationDocument);
            case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
            case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
        }

@@ -0,0 +1,74 @@
package com.knecon.fforesight.service.layoutparser.processor;

import java.nio.charset.StandardCharsets;
import java.util.List;

import org.commonmark.Extension;
import org.commonmark.ext.gfm.tables.TablesExtension;
import org.commonmark.node.Document;
import org.commonmark.node.Node;
import org.commonmark.parser.Parser;
import org.commonmark.renderer.Renderer;
import org.commonmark.renderer.markdown.MarkdownRenderer;
import org.springframework.stereotype.Service;

import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.processor.markdown.MarkdownChunker;
import com.knecon.fforesight.tenantcommons.TenantContext;

import io.micrometer.observation.annotation.Observed;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;

@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class MarkdownParsingPipeline {

    StorageService storageService;


    @SneakyThrows
    @Observed(name = "MarkdownParsingPipeline", contextualName = "parse-markdown")
    public LayoutParsingFinishedEvent parseMarkdownAndSaveToStorage(LayoutParsingRequest request) {

        long start = System.currentTimeMillis();
        String markdown;
        try (var in = storageService.getObject(TenantContext.getTenantId(), request.originFileStorageId()).getInputStream()) {
            markdown = new String(in.readAllBytes(), StandardCharsets.UTF_8);
        }
        Parser parser = buildParser();
        Node node = parser.parse(markdown);

        MarkdownChunker chunker = new MarkdownChunker(600);

        node.accept(chunker);

        Renderer renderer = buildRenderer();
        List<Document> markdownChunks = chunker.getResult();
        for (Document markdownChunk : markdownChunks) {
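            // TODO (prototype): the loop body is still empty in this commit. Presumably
            // each chunk is meant to be rendered via the Renderer built above
            // (renderer.render(markdownChunk)) and persisted through storageService.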
        }

        return LayoutParsingFinishedEvent.builder().identifier(request.identifier()).numberOfPages(1).duration(System.currentTimeMillis() - start).build();
    }


    public static Parser buildParser() {

        List<Extension> extensions = List.of(TablesExtension.create());
        return Parser.builder().extensions(extensions).build();
    }


    public static MarkdownRenderer buildRenderer() {

        List<Extension> extensions = List.of(TablesExtension.create());
        return MarkdownRenderer.builder().extensions(extensions).build();
    }

}
@@ -0,0 +1,13 @@
package com.knecon.fforesight.service.layoutparser.processor.markdown;

import org.commonmark.node.IndentedCodeBlock;
import org.commonmark.node.Paragraph;

import com.knecon.fforesight.service.layoutparser.processor.utils.TokenCounter;

public class ExtraTokens {

    public static int INDENTED_CODE_BLOCK = 10;
    public static int PARAGRAPH = 10;

}
|
||||
package com.knecon.fforesight.service.layoutparser.processor.markdown;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.MarkdownParsingPipeline.buildRenderer;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.TokenCounter.countTokens;
|
||||
|
||||
import java.text.BreakIterator;
|
||||
import java.util.Collections;
|
||||
import java.util.Deque;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.commonmark.ext.gfm.tables.TableBlock;
|
||||
import org.commonmark.ext.gfm.tables.TableBody;
|
||||
import org.commonmark.node.AbstractVisitor;
|
||||
import org.commonmark.node.BlockQuote;
|
||||
import org.commonmark.node.BulletList;
|
||||
import org.commonmark.node.Code;
|
||||
import org.commonmark.node.CustomBlock;
|
||||
import org.commonmark.node.CustomNode;
|
||||
import org.commonmark.node.Document;
|
||||
import org.commonmark.node.Emphasis;
|
||||
import org.commonmark.node.FencedCodeBlock;
|
||||
import org.commonmark.node.HardLineBreak;
|
||||
import org.commonmark.node.Heading;
|
||||
import org.commonmark.node.HtmlBlock;
|
||||
import org.commonmark.node.HtmlInline;
|
||||
import org.commonmark.node.Image;
|
||||
import org.commonmark.node.IndentedCodeBlock;
|
||||
import org.commonmark.node.Link;
|
||||
import org.commonmark.node.LinkReferenceDefinition;
|
||||
import org.commonmark.node.ListBlock;
|
||||
import org.commonmark.node.ListItem;
|
||||
import org.commonmark.node.Node;
|
||||
import org.commonmark.node.OrderedList;
|
||||
import org.commonmark.node.Paragraph;
|
||||
import org.commonmark.node.SoftLineBreak;
|
||||
import org.commonmark.node.StrongEmphasis;
|
||||
import org.commonmark.node.Text;
|
||||
import org.commonmark.node.ThematicBreak;
|
||||
import org.commonmark.renderer.Renderer;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TokenCounter;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class MarkdownChunker extends AbstractVisitor {
|
||||
|
||||
NodeCopier nodeCopier = NodeCopier.INSTANCE;
|
||||
|
||||
final int tokenLimit;
|
||||
List<Document> allChunks;
|
||||
|
||||
Deque<Heading> currentHeadings;
|
||||
Document chunk;
|
||||
boolean validChunk;
|
||||
|
||||
|
||||
public MarkdownChunker(int tokenLimit) {
|
||||
|
||||
this.tokenLimit = tokenLimit;
|
||||
allChunks = new LinkedList<>();
|
||||
currentHeadings = new LinkedList<>();
|
||||
startNewChunk();
|
||||
}
|
||||
|
||||
|
||||
public List<Document> getResult() {
|
||||
|
||||
for (Document chunk : allChunks) {
|
||||
if (countTokens(chunk) > tokenLimit) {
|
||||
throwUnsplittableNodeError(chunk);
|
||||
}
|
||||
}
|
||||
return allChunks;
|
||||
}
|
||||
|
||||
|
||||
    @Override
    public void visit(Heading heading) {

        if (heading.getLevel() > 4) {
            // deep headings are treated as ordinary content, not as section boundaries
            addToChunk(heading);
            return;
        }
        if (currentHeadings.isEmpty() || currentHeadings.peek().getLevel() < heading.getLevel()) {
            currentHeadings.push(heading);
        } else {
            while (!currentHeadings.isEmpty() && currentHeadings.peek().getLevel() >= heading.getLevel()) {
                currentHeadings.pop();
            }
            currentHeadings.push(heading);
        }

        startNewChunk();
    }


    private void startNewChunk() {

        if (!validChunk && !allChunks.isEmpty()) {
            allChunks.remove(allChunks.size() - 1);
        }
        validChunk = false;
        chunk = buildNewChunk();
        allChunks.add(chunk);
    }


    private Document buildNewChunk() {

        Document document = new Document();
        List<Node> headingCopies = currentHeadings.stream()
                .map(nodeCopier::copyNodeWithChildren)
                .collect(Collectors.toList());
        Collections.reverse(headingCopies);
        headingCopies.forEach(document::appendChild);
        return document;
    }


    public int currentTokenCount() {

        return TokenCounter.countTokens(chunk);
    }


    public boolean fitsTokenLimit(Node node) {

        Document document = buildNewChunk();
        document.appendChild(nodeCopier.copyNodeWithChildren(node));
        return TokenCounter.countTokens(document) <= tokenLimit;
    }


    private void addToChunk(Node node) {

        chunk.appendChild(node);

        if (currentTokenCount() <= tokenLimit) {
            // the node fits: mark the chunk valid so startNewChunk() does not discard it
            validChunk = true;
            return;
        }

        node.unlink();
        startNewChunk();
        chunk.appendChild(node);

        if (currentTokenCount() > tokenLimit) { // node is too large and won't fit in tokenLimit, split is necessary
            node.unlink();
            startNewChunk();
            splitNodeAndAddToChunk(node);
            return;
        }

        validChunk = true;
    }


    private void splitNodeAndAddToChunk(Node node) {

        if (node instanceof TableBlock tableBlock) {
            splitTable(tableBlock);
            return;
        } else if (node instanceof BulletList bulletList) {
            splitList(bulletList);
            return;
        } else if (node instanceof OrderedList orderedList) {
            splitList(orderedList);
            return;
        } else if (node instanceof Paragraph paragraph) {
            splitParagraph(paragraph);
            return;
        } else if (node instanceof IndentedCodeBlock indentedCodeBlock) {
            splitCodeBlock(indentedCodeBlock);
            return;
        }
        throwUnsplittableNodeError(node);
    }


    private void splitCodeBlock(IndentedCodeBlock indentedCodeBlock) {

        List<IndentedCodeBlock> splitBlocks = new LinkedList<>();
        StringBuilder sb = new StringBuilder();
        BreakIterator lineIterator = BreakIterator.getLineInstance(Locale.ENGLISH);
        lineIterator.setText(indentedCodeBlock.getLiteral());
        int start = lineIterator.first();
        for (int end = lineIterator.next(); end != BreakIterator.DONE; start = end, end = lineIterator.next()) {
            String segment = indentedCodeBlock.getLiteral().substring(start, end);
            sb.append(segment);
            if (!fitsTokenLimit(buildIndentedCodeBlock(sb.toString()))) {
                // over the limit: strip the segment that caused the overflow, flush, and carry it over
                sb.replace(sb.length() - segment.length(), sb.length(), "");
                IndentedCodeBlock block = buildIndentedCodeBlock(sb.toString());
                splitBlocks.add(block);
                sb = new StringBuilder(segment);
            }
        }
        if (!sb.isEmpty()) {
            if (fitsTokenLimit(buildIndentedCodeBlock(sb.toString()))) {
                splitBlocks.add(buildIndentedCodeBlock(sb.toString()));
            } else {
                int mid = sb.length() / 2;
                splitBlocks.add(buildIndentedCodeBlock(sb.substring(0, mid)));
                splitBlocks.add(buildIndentedCodeBlock(sb.substring(mid)));
            }
        }

        splitBlocks.forEach(this::addToChunk);
    }


    private static IndentedCodeBlock buildIndentedCodeBlock(String string) {

        IndentedCodeBlock block = new IndentedCodeBlock();
        block.setLiteral(string);
        return block;
    }


    private void splitParagraph(Paragraph paragraph) {

        if (fitsTokenLimit(paragraph)) {
            addToChunk(paragraph);
            return;
        }

        List<Node> children = collectAllChildNodes(paragraph);

        if (children.size() == 1) {
            if (children.get(0) instanceof Text text) {
                List<Text> splitTexts = splitText(text);
                for (Text splitText : splitTexts) {
                    Paragraph paragraph1 = new Paragraph();
                    paragraph1.appendChild(splitText);
                    addToChunk(paragraph1);
                }
                return;
            }
            throwUnsplittableNodeError(children.get(0));
        }

        Paragraph paragraph1 = new Paragraph();
        Paragraph paragraph2 = new Paragraph();

        int mid = children.size() / 2;
        children.subList(0, mid)
                .forEach(paragraph1::appendChild);
        children.subList(mid, children.size())
                .forEach(paragraph2::appendChild);

        splitParagraph(paragraph1);
        splitParagraph(paragraph2);
    }


    private void throwUnsplittableNodeError(Node node) {

        Renderer renderer = buildRenderer();
        String renderedNode = renderer.render(node);
        log.error(renderedNode);
        throw new IllegalArgumentException(String.format("Node %s exceeds token limit (%d/%d) and can't be split!", node, countTokens(renderedNode), tokenLimit));
    }


    private static List<Node> collectAllChildNodes(Node parent) {

        List<Node> children = new LinkedList<>();
        Node next;
        for (Node child = parent.getFirstChild(); child != null; child = next) {
            next = child.getNext();
            children.add(child);
        }
        return children;
    }


    private List<Text> splitText(Text text) {

        List<Text> splitTexts = new LinkedList<>();
        StringBuilder sb = new StringBuilder();
        BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.ENGLISH);
        sentenceIterator.setText(text.getLiteral());
        int start = sentenceIterator.first();
        for (int end = sentenceIterator.next(); end != BreakIterator.DONE; start = end, end = sentenceIterator.next()) {
            String sentence = text.getLiteral().substring(start, end);
            sb.append(sentence);
            if (!fitsTokenLimit(buildParagraphWithText(sb))) {
                // over the limit: strip the sentence that caused the overflow, flush, and carry it over
                sb.replace(sb.length() - sentence.length(), sb.length(), "");
                splitTexts.add(new Text(sb.toString()));
                sb = new StringBuilder(sentence);
            }
        }
        if (!sb.isEmpty()) {
            if (fitsTokenLimit(buildParagraphWithText(sb))) {
                splitTexts.add(new Text(sb.toString()));
            } else {
                int mid = sb.length() / 2;
                splitTexts.add(new Text(sb.substring(0, mid)));
                splitTexts.add(new Text(sb.substring(mid)));
            }
        }
        return splitTexts;
    }


    private static Paragraph buildParagraphWithText(StringBuilder sb) {

        Paragraph paragraph = new Paragraph();
        paragraph.appendChild(new Text(sb.toString()));
        return paragraph;
    }


    private void splitList(BulletList bulletList) {

        if (fitsTokenLimit(bulletList)) {
            addToChunk(bulletList);
            return;
        }

        BulletList list1 = new BulletList();
        BulletList list2 = new BulletList();

        splitLists(bulletList, list1, list2);
    }


    private void splitList(OrderedList orderedList) {

        if (fitsTokenLimit(orderedList)) {
            addToChunk(orderedList);
            return;
        }

        OrderedList list1 = new OrderedList();
        OrderedList list2 = new OrderedList();

        splitLists(orderedList, list1, list2);
    }


    private void splitLists(ListBlock originList, ListBlock list1, ListBlock list2) {

        List<Node> listItems = collectAllChildNodes(originList);

        if (listItems.size() == 1) {
            // a single item already exceeds the limit: fall back to chunking its children directly
            collectAllChildNodes(listItems.get(0)).forEach(this::addToChunk);
            return;
        }

        int mid = listItems.size() / 2;
        listItems.subList(0, mid)
                .forEach(list1::appendChild);
        listItems.subList(mid, listItems.size())
                .forEach(list2::appendChild);

        splitNodeAndAddToChunk(list1);
        splitNodeAndAddToChunk(list2);
    }


    private void splitTable(TableBlock tableBlock) {

        if (fitsTokenLimit(tableBlock)) {
            addToChunk(tableBlock);
            return;
        }

        TableBlock tableBlock1 = new TableBlock();
        TableBlock tableBlock2 = new TableBlock();

        // deep-copy the header (first child) so both halves keep the header row
        tableBlock1.appendChild(nodeCopier.copyNodeWithChildren(tableBlock.getFirstChild()));
        tableBlock2.appendChild(nodeCopier.copyNodeWithChildren(tableBlock.getFirstChild()));

        TableBody tableBody1 = new TableBody();
        TableBody tableBody2 = new TableBody();

        List<Node> tableRows = collectAllChildNodes(tableBlock.getLastChild());

        if (tableRows.isEmpty()) {
            throw new IllegalArgumentException("The table header already exceeds the token limit");
        }
        if (tableRows.size() == 1) {
            throw new IllegalArgumentException("A single table row already exceeds the token limit");
        }

        int mid = tableRows.size() / 2;
        tableRows.subList(0, mid)
                .forEach(tableBody1::appendChild);
        tableRows.subList(mid, tableRows.size())
                .forEach(tableBody2::appendChild);

        // attach the split bodies to their new tables before recursing
        tableBlock1.appendChild(tableBody1);
        tableBlock2.appendChild(tableBody2);

        splitTable(tableBlock1);
        splitTable(tableBlock2);
    }


    public void visit(BlockQuote blockQuote) {

        this.addToChunk(blockQuote);
    }


    public void visit(BulletList bulletList) {

        this.addToChunk(bulletList);
    }


    public void visit(Code code) {

        this.addToChunk(code);
    }


    public void visit(Emphasis emphasis) {

        this.addToChunk(emphasis);
    }


    public void visit(FencedCodeBlock fencedCodeBlock) {

        this.addToChunk(fencedCodeBlock);
    }


    public void visit(HardLineBreak hardLineBreak) {

        this.addToChunk(hardLineBreak);
    }


    public void visit(ThematicBreak thematicBreak) {

        this.addToChunk(thematicBreak);
    }


    public void visit(HtmlInline htmlInline) {

        this.addToChunk(htmlInline);
    }


    public void visit(HtmlBlock htmlBlock) {

        this.addToChunk(htmlBlock);
    }


    public void visit(Image image) {

        this.addToChunk(image);
    }


    public void visit(IndentedCodeBlock indentedCodeBlock) {

        this.addToChunk(indentedCodeBlock);
    }


    public void visit(Link link) {

        this.addToChunk(link);
    }


    public void visit(ListItem listItem) {

        this.addToChunk(listItem);
    }


    public void visit(OrderedList orderedList) {

        this.addToChunk(orderedList);
    }


    public void visit(Paragraph paragraph) {

        this.addToChunk(paragraph);
    }


    public void visit(SoftLineBreak softLineBreak) {

        this.addToChunk(softLineBreak);
    }


    public void visit(StrongEmphasis strongEmphasis) {

        this.addToChunk(strongEmphasis);
    }


    public void visit(Text text) {

        this.addToChunk(text);
    }


    public void visit(LinkReferenceDefinition linkReferenceDefinition) {

        this.addToChunk(linkReferenceDefinition);
    }


    public void visit(CustomBlock customBlock) {

        this.addToChunk(customBlock);
    }


    public void visit(CustomNode customNode) {

        this.addToChunk(customNode);
    }

}
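For orientation, the chunker is driven exactly as in MarkdownParsingPipeline above. A minimal usage sketch, assembled from the methods in this commit (the persistence step is not part of the commit and is only assumed):

    Parser parser = MarkdownParsingPipeline.buildParser();
    Node root = parser.parse(markdown);                     // markdown: raw file content
    MarkdownChunker chunker = new MarkdownChunker(600);     // 600-token limit, as used in the pipeline
    root.accept(chunker);                                   // walks the tree and accumulates chunks
    MarkdownRenderer renderer = MarkdownParsingPipeline.buildRenderer();
    for (Document chunk : chunker.getResult()) {            // each chunk is a standalone Document
        String renderedChunk = renderer.render(chunk);      // re-render the chunk back to markdown
        // storing renderedChunk is left open in this prototype
    }
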
@@ -0,0 +1,171 @@
package com.knecon.fforesight.service.layoutparser.processor.markdown;

import org.commonmark.ext.gfm.tables.TableBlock;
import org.commonmark.ext.gfm.tables.TableBody;
import org.commonmark.ext.gfm.tables.TableCell;
import org.commonmark.ext.gfm.tables.TableHead;
import org.commonmark.ext.gfm.tables.TableRow;
import org.commonmark.node.BlockQuote;
import org.commonmark.node.BulletList;
import org.commonmark.node.Code;
import org.commonmark.node.Document;
import org.commonmark.node.Emphasis;
import org.commonmark.node.FencedCodeBlock;
import org.commonmark.node.HardLineBreak;
import org.commonmark.node.Heading;
import org.commonmark.node.HtmlBlock;
import org.commonmark.node.HtmlInline;
import org.commonmark.node.Image;
import org.commonmark.node.IndentedCodeBlock;
import org.commonmark.node.Link;
import org.commonmark.node.LinkReferenceDefinition;
import org.commonmark.node.ListItem;
import org.commonmark.node.Node;
import org.commonmark.node.OrderedList;
import org.commonmark.node.Paragraph;
import org.commonmark.node.SoftLineBreak;
import org.commonmark.node.StrongEmphasis;
import org.commonmark.node.Text;
import org.commonmark.node.ThematicBreak;
import org.mapstruct.Mapper;
import org.mapstruct.Mapping;
import org.mapstruct.factory.Mappers;

@Mapper
public interface NodeCopier {

    NodeCopier INSTANCE = Mappers.getMapper(NodeCopier.class);


    default Node copyNodeWithChildren(Node node) {

        Node copy = copy(node);

        Node next;
        for (Node child = node.getFirstChild(); child != null; child = next) {
            next = child.getNext();
            copy.appendChild(copyNodeWithChildren(child));
        }
        return copy;
    }


    default Node copy(Node node) {

        return switch (node.getClass().getSimpleName()) {
            case "BlockQuote" -> copy((BlockQuote) node);
            case "BulletList" -> copy((BulletList) node);
            case "Code" -> copy((Code) node);
            case "Document" -> copy((Document) node);
            case "Emphasis" -> copy((Emphasis) node);
            case "FencedCodeBlock" -> copy((FencedCodeBlock) node);
            case "HardLineBreak" -> copy((HardLineBreak) node);
            case "Heading" -> copy((Heading) node);
            case "HtmlBlock" -> copy((HtmlBlock) node);
            case "HtmlInline" -> copy((HtmlInline) node);
            case "Image" -> copy((Image) node);
            case "IndentedCodeBlock" -> copy((IndentedCodeBlock) node);
            case "Link" -> copy((Link) node);
            case "LinkReferenceDefinition" -> copy((LinkReferenceDefinition) node);
            case "ListItem" -> copy((ListItem) node);
            case "OrderedList" -> copy((OrderedList) node);
            case "Paragraph" -> copy((Paragraph) node);
            case "SoftLineBreak" -> copy((SoftLineBreak) node);
            case "StrongEmphasis" -> copy((StrongEmphasis) node);
            case "Text" -> copy((Text) node);
            case "ThematicBreak" -> copy((ThematicBreak) node);
            case "TableBlock" -> copy((TableBlock) node);
            case "TableBody" -> copy((TableBody) node);
            case "TableCell" -> copy((TableCell) node);
            case "TableHead" -> copy((TableHead) node);
            case "TableRow" -> copy((TableRow) node);
            default -> throw new IllegalArgumentException("No copy method found for class: " + node.getClass().getName());
        };
    }


    BlockQuote copy(BlockQuote blockQuote);


    @Mapping(target = "bulletMarker", ignore = true)
    BulletList copy(BulletList bulletList);


    Code copy(Code code);


    Document copy(Document document);


    @Mapping(target = "delimiter", source = "openingDelimiter")
    Emphasis copy(Emphasis emphasis);


    @Mapping(target = "fenceChar", ignore = true)
    @Mapping(target = "fenceLength", ignore = true)
    FencedCodeBlock copy(FencedCodeBlock fencedCodeBlock);


    HardLineBreak copy(HardLineBreak hardLineBreak);


    Heading copy(Heading heading);


    HtmlBlock copy(HtmlBlock htmlBlock);


    HtmlInline copy(HtmlInline htmlInline);


    Image copy(Image image);


    IndentedCodeBlock copy(IndentedCodeBlock indentedCodeBlock);


    Link copy(Link link);


    LinkReferenceDefinition copy(LinkReferenceDefinition linkReferenceDefinition);


    ListItem copy(ListItem listItem);


    @Mapping(target = "startNumber", ignore = true)
    @Mapping(target = "delimiter", ignore = true)
    OrderedList copy(OrderedList orderedList);


    Paragraph copy(Paragraph paragraph);


    SoftLineBreak copy(SoftLineBreak softLineBreak);


    @Mapping(target = "delimiter", source = "openingDelimiter")
    StrongEmphasis copy(StrongEmphasis strongEmphasis);


    Text copy(Text text);


    ThematicBreak copy(ThematicBreak thematicBreak);


    TableBlock copy(TableBlock tableBlock);


    TableBody copy(TableBody tableBody);


    TableCell copy(TableCell tableCell);


    TableHead copy(TableHead tableHead);


    TableRow copy(TableRow tableRow);

}
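NodeCopier relies on MapStruct (hence the mapstruct/annotationProcessor dependencies added to build.gradle above) to generate the per-type shallow copy methods at compile time; copyNodeWithChildren then recurses over getFirstChild()/getNext() to deep-copy a whole subtree, which is how MarkdownChunker duplicates the current heading stack into each new chunk. A small illustrative sketch (the heading content is hypothetical):

    Heading heading = new Heading();
    heading.setLevel(2);
    heading.appendChild(new Text("Background"));    // hypothetical sample content

    // deep copy: the copy can be appended to a new chunk without unlinking the original
    Heading copy = (Heading) NodeCopier.INSTANCE.copyNodeWithChildren(heading);
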
@@ -0,0 +1,65 @@
package com.knecon.fforesight.service.layoutparser.processor.markdown;

import java.lang.reflect.Field;

import org.commonmark.node.Node;

public class ReflectionNodeCopier {

    NodeCopier mapperNodeCopier;


    public static Node copyNode(Node node) {

        Node copy = deepCopy(node);
        copyChildren(node, copy);
        return copy;
    }


    private static void copyChildren(Node nodeToCopy, Node copy) {

        Node next;
        for (Node node = nodeToCopy.getFirstChild(); node != null; node = next) {
            next = node.getNext();
            copy.appendChild(copyNode(node));
        }
    }


    private static <T> T deepCopy(T object) {

        try {
            Class<?> clazz = object.getClass();
            T copy = (T) clazz.getDeclaredConstructor().newInstance();

            for (Field field : clazz.getDeclaredFields()) {
                field.setAccessible(true);
                Object value = field.get(object);
                if (isPrimitiveOrWrapper(field.getType()) || field.getType().equals(String.class)) {
                    field.set(copy, value);
                }
            }
            return copy;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }


    private static boolean isPrimitiveOrWrapper(Class<?> type) {

        return type.isPrimitive()
                || type == Boolean.class
                || type == Byte.class
                || type == Character.class
                || type == Double.class
                || type == Float.class
                || type == Integer.class
                || type == Long.class
                || type == Short.class;
    }

}
@@ -7,6 +7,8 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

+import com.google.common.base.Functions;
+
import lombok.Getter;

@Getter
@@ -59,7 +61,9 @@ public class FloatFrequencyCounter {
            }
        }

-       return higher.stream().sorted(Collections.reverseOrder()).collect(Collectors.toList());
+       return higher.stream()
+               .sorted(Collections.reverseOrder())
+               .collect(Collectors.toList());
    }


@@ -74,4 +78,16 @@ public class FloatFrequencyCounter {
        return highest;
    }


+   public double getAverage() {
+
+       double sum = countPerValue.keySet()
+               .stream()
+               .mapToDouble(fontSize -> fontSize * countPerValue.get(fontSize)).sum();
+       double count = countPerValue.values()
+               .stream()
+               .mapToInt(Integer::intValue).sum();
+       return sum / count;
+   }
+
}

@@ -60,8 +60,8 @@ public class Document extends AbstractSemanticNode {
     *
     * @return A list of main sections within the document
     * @deprecated This method is marked for removal.
-    * Use {@link #streamChildrenOfType(NodeType)} instead,
-    * or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
+    *         Use {@link #streamChildrenOfType(NodeType)} instead,
+    *         or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
     */
    @Deprecated(forRemoval = true)
    public List<Section> getMainSections() {

@@ -42,7 +42,9 @@ public interface SemanticNode {
     */
    default TextBlock getTextBlock() {

-       return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector());
+       return streamAllSubNodes().filter(SemanticNode::isLeaf)
+               .map(SemanticNode::getTextBlock)
+               .collect(new TextBlockCollector());
    }


@@ -68,7 +70,10 @@ public interface SemanticNode {

    default Page getFirstPage() {

-       return getTextBlock().getPages().stream().min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
+       return getTextBlock().getPages()
+               .stream()
+               .min(Comparator.comparingInt(Page::getNumber))
+               .orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
    }


@@ -88,7 +93,8 @@ public interface SemanticNode {

    default boolean isOnPage(int pageNumber) {

-       return getPages().stream().anyMatch(page -> page.getNumber() == pageNumber);
+       return getPages().stream()
+               .anyMatch(page -> page.getNumber() == pageNumber);
    }


@@ -203,7 +209,9 @@ public interface SemanticNode {
     */
    default boolean hasEntitiesOfType(String type) {

-       return getEntities().stream().filter(entity -> entity.getEntityType().equals(EntityType.ENTITY)).anyMatch(redactionEntity -> redactionEntity.getType().equals(type));
+       return getEntities().stream()
+               .filter(entity -> entity.getEntityType().equals(EntityType.ENTITY))
+               .anyMatch(redactionEntity -> redactionEntity.getType().equals(type));
    }


@@ -215,7 +223,9 @@ public interface SemanticNode {
     */
    default List<RedactionEntity> getEntitiesOfType(String type) {

-       return getEntities().stream().filter(redactionEntity -> redactionEntity.getType().equals(type)).toList();
+       return getEntities().stream()
+               .filter(redactionEntity -> redactionEntity.getType().equals(type))
+               .toList();
    }


@@ -227,7 +237,9 @@ public interface SemanticNode {
     */
    default List<RedactionEntity> getEntitiesOfType(List<String> types) {

-       return getEntities().stream().filter(redactionEntity -> redactionEntity.isAnyType(types)).toList();
+       return getEntities().stream()
+               .filter(redactionEntity -> redactionEntity.isAnyType(types))
+               .toList();
    }


@@ -241,7 +253,8 @@ public interface SemanticNode {

        TextBlock textBlock = getTextBlock();
        if (!textBlock.getAtomicTextBlocks().isEmpty()) {
-           return getTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage();
+           return getTextBlock().getAtomicTextBlocks()
+                   .get(0).getNumberOnPage();
        } else {
            return -1;
        }
@@ -279,7 +292,8 @@ public interface SemanticNode {
     */
    default boolean containsStrings(List<String> strings) {

-       return strings.stream().allMatch(this::containsString);
+       return strings.stream()
+               .allMatch(this::containsString);
    }


@@ -303,7 +317,8 @@ public interface SemanticNode {
     */
    default boolean containsAnyString(List<String> strings) {

-       return strings.stream().anyMatch(this::containsString);
+       return strings.stream()
+               .anyMatch(this::containsString);
    }


@@ -315,7 +330,8 @@ public interface SemanticNode {
     */
    default boolean containsAnyStringIgnoreCase(List<String> strings) {

-       return strings.stream().anyMatch(this::containsStringIgnoreCase);
+       return strings.stream()
+               .anyMatch(this::containsStringIgnoreCase);
    }


@@ -386,7 +402,8 @@ public interface SemanticNode {
     */
    default Stream<SemanticNode> streamAllSubNodes() {

-       return getDocumentTree().allSubEntriesInOrder(getTreeId()).map(DocumentTree.Entry::getNode);
+       return getDocumentTree().allSubEntriesInOrder(getTreeId())
+               .map(DocumentTree.Entry::getNode);
    }


@@ -397,7 +414,9 @@ public interface SemanticNode {
     */
    default Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType) {

-       return getDocumentTree().allSubEntriesInOrder(getTreeId()).filter(entry -> entry.getType().equals(nodeType)).map(DocumentTree.Entry::getNode);
+       return getDocumentTree().allSubEntriesInOrder(getTreeId())
+               .filter(entry -> entry.getType().equals(nodeType))
+               .map(DocumentTree.Entry::getNode);
    }


@@ -454,8 +473,16 @@ public interface SemanticNode {
    private Map<Page, Rectangle2D> getBBoxFromChildren() {

        Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
-       List<Map<Page, Rectangle2D>> childrenBBoxes = streamChildren().map(SemanticNode::getBBox).toList();
-       Set<Page> pages = childrenBBoxes.stream().flatMap(map -> map.keySet().stream()).collect(Collectors.toSet());
+
+       List<Map<Page, Rectangle2D>> childrenBBoxes = streamChildren() //
+               .filter(SemanticNode::isNotOcrImage)
+               .map(SemanticNode::getBBox)
+               .toList();
+
+       Set<Page> pages = childrenBBoxes.stream()
+               .flatMap(map -> map.keySet()
+                       .stream())
+               .collect(Collectors.toSet());
        for (Page page : pages) {
            Rectangle2D bBoxOnPage = childrenBBoxes.stream()
                    .filter(childBboxPerPage -> childBboxPerPage.containsKey(page))
@@ -467,13 +494,24 @@ public interface SemanticNode {
    }


+   private static boolean isNotOcrImage(SemanticNode node) {
+
+       return !node.getType().equals(NodeType.IMAGE);
+   }
+
    /**
     * @return The union of all BoundingBoxes of the TextBlock of this node
     */
    private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock() {

        Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
-       Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
+       Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks()
+               .stream()
+               .collect(Collectors.groupingBy(AtomicTextBlock::getPage));
        atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
        return bBoxPerPage;
    }

@@ -109,10 +109,7 @@ public class AtomicTextBlock implements TextBlock {
    }


-   public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData,
-           DocumentPositionData documentPositionData,
-           SemanticNode parent,
-           Page page) {
+   public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, DocumentPositionData documentPositionData, SemanticNode parent, Page page) {

        return AtomicTextBlock.builder()
                .id(documentTextData.getId())
@@ -120,8 +117,10 @@ public class AtomicTextBlock implements TextBlock {
                .page(page)
                .boundary(new Boundary(documentTextData.getStart(), documentTextData.getEnd()))
                .searchText(documentTextData.getSearchText())
-               .lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList())
-               .stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList())
+               .lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed()
+                       .toList())
+               .stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed()
+                       .toList())
                .positions(toRectangle2DList(documentPositionData.getPositions()))
                .parent(parent)
                .build();
@@ -130,7 +129,9 @@ public class AtomicTextBlock implements TextBlock {

    private static List<Rectangle2D> toRectangle2DList(float[][] positions) {

-       return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList();
+       return Arrays.stream(positions)
+               .map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3]))
+               .toList();
    }


@@ -159,9 +160,9 @@ public class AtomicTextBlock implements TextBlock {
    public int getNextLinebreak(int fromIndex) {

        return lineBreaks.stream()//
-               .filter(linebreak -> linebreak > fromIndex - boundary.start()) //
-               .findFirst() //
-               .orElse(searchText.length()) + boundary.start();
+                .filter(linebreak -> linebreak > fromIndex - boundary.start()) //
+                .findFirst() //
+                .orElse(searchText.length()) + boundary.start();
    }


@@ -169,9 +170,9 @@ public class AtomicTextBlock implements TextBlock {
    public int getPreviousLinebreak(int fromIndex) {

        return lineBreaks.stream()//
-               .filter(linebreak -> linebreak <= fromIndex - boundary.start())//
-               .reduce((a, b) -> b)//
-               .orElse(0) + boundary.start();
+                .filter(linebreak -> linebreak <= fromIndex - boundary.start())//
+                .reduce((a, b) -> b)//
+                .orElse(0) + boundary.start();
    }


@@ -219,7 +220,10 @@ public class AtomicTextBlock implements TextBlock {

    private List<Integer> getAllLineBreaksInBoundary(Boundary boundary) {

-       return getLineBreaks().stream().map(linebreak -> linebreak + this.boundary.start()).filter(boundary::contains).toList();
+       return getLineBreaks().stream()
+               .map(linebreak -> linebreak + this.boundary.start())
+               .filter(boundary::contains)
+               .toList();
    }


@@ -0,0 +1,30 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;

import static com.knecon.fforesight.service.layoutparser.processor.MarkdownParsingPipeline.buildRenderer;

import org.commonmark.node.Node;
import org.commonmark.renderer.markdown.MarkdownRenderer;

import com.didalgo.gpt3.Encoding;
import com.didalgo.gpt3.GPT3Tokenizer;
import com.didalgo.gpt3.TokenCount;

public class TokenCounter {

    private static final GPT3Tokenizer tokenizer = new GPT3Tokenizer(Encoding.CL100K_BASE);


    public static int countTokens(Node node) {

        MarkdownRenderer renderer = buildRenderer();
        String markdownResult = renderer.render(node);
        return countTokens(markdownResult);
    }


    public static synchronized int countTokens(String text) {

        return TokenCount.fromString(text, tokenizer);
    }

}
@@ -10,6 +10,7 @@ import org.springframework.amqp.core.Message;
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
+import org.springframework.boot.actuate.logging.LogFileWebEndpoint;
import org.springframework.stereotype.Service;

import com.fasterxml.jackson.databind.ObjectMapper;
@@ -18,6 +19,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
+import com.knecon.fforesight.service.layoutparser.processor.MarkdownParsingPipeline;

import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
@@ -29,9 +31,11 @@ import lombok.extern.slf4j.Slf4j;
public class MessageHandler {

    private final LayoutParsingPipeline layoutParsingPipeline;
+   private final MarkdownParsingPipeline markdownParsingPipeline;
    private final ObjectMapper objectMapper;
    private final RabbitTemplate rabbitTemplate;
    private final static String X_PIPELINE_PREFIX = "X-PIPE-";
+   private final LogFileWebEndpoint logFileWebEndpoint;


    @RabbitHandler
@@ -41,30 +45,30 @@ public class MessageHandler {

        LayoutParsingRequest layoutParsingRequest = objectMapper.readValue(message.getBody(), LayoutParsingRequest.class);

        if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND) && layoutParsingRequest.researchDocumentStorageId() == null) {
            throw new IllegalArgumentException("ResearchDocumentDataStorageId is null!");
        }
        log.info("Layout parsing request received {}", layoutParsingRequest.identifier());
        if (message.getMessageProperties().isRedelivered()) {
            throw new AmqpRejectAndDontRequeueException(String.format("Error during last layout parsing of request with identifier: %s, do not retry.",
-                   layoutParsingRequest.identifier()));
+               layoutParsingRequest.identifier()));
        }
+       LayoutParsingFinishedEvent layoutParsingFinishedEvent;
+       if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.MARKDOWN)) {
+           layoutParsingFinishedEvent = markdownParsingPipeline.parseMarkdownAndSaveToStorage(layoutParsingRequest);
+       } else {
+           layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
+       }
-       LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
        sendLayoutParsingFinishedEvent(layoutParsingFinishedEvent, message);
    }


    public void sendLayoutParsingFinishedEvent(LayoutParsingFinishedEvent layoutParsingFinishedEvent, Message message) {

-       Arrays.stream(layoutParsingFinishedEvent.message().split("\n")).forEach(log::info);
+       Arrays.stream(layoutParsingFinishedEvent.message().split("\n"))
+               .forEach(log::info);
        rabbitTemplate.convertAndSend(LayoutParsingQueueNames.LAYOUT_PARSING_FINISHED_EVENT_QUEUE, layoutParsingFinishedEvent, m -> {
+           var forwardHeaders = message.getMessageProperties()
+                   .getHeaders()
+                   .entrySet()
-           var forwardHeaders = message.getMessageProperties().getHeaders().entrySet()
                    .stream()
                    .filter(e -> e.getKey().toUpperCase(Locale.ROOT).startsWith(X_PIPELINE_PREFIX))
-                   .collect(Collectors.toMap(Map.Entry::getKey,
-                           Map.Entry::getValue));
+                   .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
            m.getMessageProperties().getHeaders().putAll(forwardHeaders);
            return m;
        });

@@ -44,7 +44,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
    @SneakyThrows
    public void testLayoutParserEndToEndWithFolder() {

-       String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files";
+       String folder = "/home/kschuettler/Dokumente/TestFiles/syn-dm-single-digit-headlines";
        List<Path> pdfFiles = Files.walk(Path.of(folder))
                .filter(path -> path.getFileName().toString().endsWith(".pdf"))
                .sorted(Comparator.comparing(Path::getFileName))
@@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
            file = new File(filePath);
        }

-       LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
+       LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.DOCUMINE_OLD, true);
        prepareStorage(layoutParsingRequest, file);

        LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);

@@ -0,0 +1,54 @@
package com.knecon.fforesight.service.layoutparser.server;

import java.io.FileInputStream;
import java.nio.file.Files;
import java.nio.file.Path;

import org.junit.jupiter.api.Test;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.MarkdownParsingPipeline;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;

import lombok.AccessLevel;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;

@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class MarkdownParsingPipelineTest {

    static String TENANT = "tenant";
    ObjectMapper mapper = ObjectMapperFactory.create();
    FileSystemBackedStorageService storageService = new FileSystemBackedStorageService(mapper);
    MarkdownParsingPipeline markdownParsingPipeline = new MarkdownParsingPipeline(storageService);


    @Test
    @SneakyThrows
    public void parseMarkdownsFromFolder() {

        Path file = Path.of("/home/kschuettler/Dokumente/TestFiles/confluence_dump/");
        Files.walk(file)
                .filter(path -> path.getFileName().toFile().toString().endsWith(".md"))
                .peek(System.out::println)
                .forEach(this::parseMarkdown);
    }


    @SneakyThrows
    public void parseMarkdown(Path file) {

        LayoutParsingRequest layoutParsingRequest = AbstractTest.buildDefaultLayoutParsingRequest(file.getFileName().toFile().toString(), LayoutParsingType.MARKDOWN, true);

        try (var in = new FileInputStream(file.toFile())) {
            storageService.storeObject(TENANT, layoutParsingRequest.originFileStorageId(), in);
        }

        markdownParsingPipeline.parseMarkdownAndSaveToStorage(layoutParsingRequest);
    }

}
@@ -1,5 +1,9 @@
#!/bin/bash
+
+set -e
+
dir=${PWD##*/}
+
gradle assemble

# Get the current Git branch
@@ -11,5 +15,32 @@ commit_hash=$(git rev-parse --short=5 HEAD)
# Combine branch and commit hash
buildName="${USER}-${branch}-${commit_hash}"

-gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
-echo "nexus.knecon.com:5001/ff/layoutparser-service-server:$buildName"
+gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${buildName}
+
+newImageName="nexus.knecon.com:5001/ff/layoutparser-service-server:${buildName}"
+
+echo "full image name:"
+echo ${newImageName}
+echo ""
+
+if [ -z "$1" ]; then
+  exit 0
+fi
+
+namespace=${1}
+deployment_name="layoutparser-service"
+
+echo "deploying to ${namespace}"
+
+oldImageName=$(rancher kubectl -n ${namespace} get deployment ${deployment_name} -o=jsonpath='{.spec.template.spec.containers[*].image}')
+
+if [ "${newImageName}" = "${oldImageName}" ]; then
+  echo "Image tag of ${deployment_name} did not change, redeploying..."
+  rancher kubectl rollout restart deployment ${deployment_name} -n ${namespace}
+else
+  echo "upgrading the image tag of ${deployment_name}..."
+  rancher kubectl set image deployment/${deployment_name} ${deployment_name}=${newImageName} -n ${namespace}
+fi
+
+rancher kubectl rollout status deployment ${deployment_name} -n ${namespace}
+echo "Deployed ${deployment_name}:${buildName} to ${namespace}"