Merge branch 'AZURE_NER' into 'main'

RED-9918: Azure entity recognition (Spike)

See merge request fforesight/layout-parser!196
This commit is contained in:
Maverick Studer 2024-08-26 14:34:46 +02:00
commit b2fa14dde2
2 changed files with 34 additions and 1 deletions

View File

@ -21,5 +21,14 @@ public class SimplifiedText {
@Schema(description = "A List of simplified Sections, which contains almost exclusively text.")
@Builder.Default
private List<SimplifiedSectionText> sectionTexts = new ArrayList<>();
@Schema(description = "A list of the main section numbers ")
@Builder.Default
private List<String> mainSectionNumbers = new ArrayList<>();
@Schema(description = "A list of the header section numbers ")
@Builder.Default
private List<String> headerSectionNumbers = new ArrayList<>();
@Schema(description = "A list of the footer section numbers ")
@Builder.Default
private List<String> footerSectionNumbers = new ArrayList<>();
}

View File

@ -34,7 +34,22 @@ public class SimplifiedSectionTextService {
List<SimplifiedSectionText> simplifiedText = Stream.of(simplifiedMainSectionsList, simplifiedHeadersList, simplifiedFootersList)
.flatMap(List::stream)
.collect(Collectors.toList());
return SimplifiedText.builder().numberOfPages(document.getNumberOfPages()).sectionTexts(simplifiedText).build();
return SimplifiedText.builder()
.numberOfPages(document.getNumberOfPages())
.sectionTexts(simplifiedText)
.mainSectionNumbers(document.getAllSections()
.stream()
.map(this::getSectionNumber)
.toList())
.headerSectionNumbers(document.getHeaders()
.stream()
.map(this::getSectionNumber)
.toList())
.footerSectionNumbers(document.getFooters()
.stream()
.map(this::getSectionNumber)
.toList())
.build();
}
@ -49,4 +64,13 @@ public class SimplifiedSectionTextService {
.build();
}
/**
 * Renders the tree id of the given node as a dot-separated string.
 *
 * <p>Every element of {@code semanticNode.getTreeId()} is converted to its string form and the
 * results are concatenated in iteration order with {@code "."} between neighbours. For example,
 * a tree id of {@code [1, 2, 3]} would yield {@code "1.2.3"} (assuming numeric elements —
 * confirm against {@code SemanticNode}). A tree id without elements yields an empty string.
 *
 * @param semanticNode the node whose tree id is rendered
 * @return the dot-joined string form of the node's tree id
 */
private String getSectionNumber(SemanticNode semanticNode) {
    StringBuilder sectionNumber = new StringBuilder();
    boolean first = true;
    for (var idPart : semanticNode.getTreeId()) {
        // The separator goes between elements only, never before the first one.
        if (!first) {
            sectionNumber.append('.');
        }
        sectionNumber.append(String.valueOf(idPart));
        first = false;
    }
    return sectionNumber.toString();
}
}