DM-307: Improved paragraph splitting #50

Merged
dominique.eiflaender1 merged 1 commit from DM-307 into master 2023-07-14 14:19:28 +02:00
2 changed files with 4 additions and 3 deletions

View File

@@ -49,7 +49,7 @@ public class DocuMineBlockificationService implements BlockificationService{
Float splitX1 = null;
for (TextPositionSequence word : textPositions) {
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
boolean lineSeparation = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * 1.5;
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;

View File

@@ -45,9 +45,10 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
@Disabled
public void titleExtraction() throws IOException {
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/402-16_Fantom_ToxicidadeCutaneaAguda.pdf");
AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/23_In Vitro Percutaneous Absorption - Human Split-Thickness Skin (1).pdf",
"files/Documine/Flora/ProblemDocs/23_In Vitro Percutaneous Absorption - Human Split-Thickness Skin (1).json");
// AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/23_In Vitro Percutaneous Absorption - Human Split-Thickness Skin (1).pdf",
// "files/Documine/Flora/ProblemDocs/23_In Vitro Percutaneous Absorption - Human Split-Thickness Skin (1).json");
System.out.println("Start Full integration test");
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));