DM-307: Undo lineheight experiment #53

Merged
dominique.eiflaender1 merged 1 commit from DM-307 into master 2023-07-17 11:42:12 +02:00
5 changed files with 833 additions and 6 deletions

View File

@ -6,6 +6,8 @@ import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service;
@ -49,16 +51,16 @@ public class DocuMineBlockificationService implements BlockificationService{
Float splitX1 = null;
for (TextPositionSequence word : textPositions) {
boolean lineSeparation = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * 1.5;
boolean lineSeparation = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * 1.25;
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY)) {
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap)) {
Orientation prevOrientation = null;
if (!chunkBlockList1.isEmpty()) {

View File

@ -45,10 +45,10 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
@Disabled
public void titleExtraction() throws IOException {
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/ITEM 23_A19022A - Dermal Absorption Human.pdf");
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/ProblemDocs/F.2. A16003E - Acute Inhalation Study.pdf");
// AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/23_In Vitro Percutaneous Absorption - Human Split-Thickness Skin (1).pdf",
// "files/Documine/Flora/ProblemDocs/23_In Vitro Percutaneous Absorption - Human Split-Thickness Skin (1).json");
// AnalyzeRequest request = prepareStorage("files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).pdf",
// "files/Documine/Flora/ProblemDocs/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (1).TABLES.json");
System.out.println("Start Full integration test");
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));

View File

@ -339,6 +339,17 @@ rule "DOC.7.1: study title"
end
// DOC.7.2: creates a "title" entity from the text that precedes a "Final Report" line
// in a page-1 section.
rule "DOC.7.2: study title"
when
// Guard: only fire when no Table fact on page 1 contains "Final Report" or "SPL".
not Table(isOnPage(1), (containsString("Final Report") || containsString("SPL")))
// Bind a Section on page 1 that contains "Final Report" or "SPL".
$section: Section(isOnPage(1), (containsString("Final Report") || containsString("SPL")))
then
// Regex: 1 to 300 characters of any kind ([\w\W] also matches line breaks) that are
// preceded by a newline and immediately followed by "\nFinal Report".
// Only the first match in the section is used.
// NOTE(review): the lookahead anchors on "Final Report" only, so a section selected
// solely because it contains "SPL" presumably yields no entity - confirm this is intended.
// NOTE(review): the helper name suggests case-insensitive matching, whereas the
// containsString checks above may be case-sensitive - verify against the helper.
entityCreationService.byRegexWithLineBreaksIgnoreCase("(?<=\\n)[\\w\\W]{1,300}(?=\\nFinal Report)", "title", EntityType.ENTITY, $section).findFirst().ifPresent(entity -> {
// Apply the match under rule identifier "DOC.7.2" with reason "Title found";
// the third argument "n-a" is passed through unchanged (meaning not visible here).
entity.apply("DOC.7.2", "Title found", "n-a");
});
end
rule "DOC.8.1: Performing Laboratory (Name)"
when
$section: Section(containsString("PERFORMING LABORATORY:"))

View File

@ -0,0 +1,814 @@
{
"dossierId": "01413f1d-0d24-4fe9-ac5b-9e66ed8a3417",
"fileId": "70fad69fbef6e0570b23b683e857e0d9",
"targetFileExtension": "ORIGIN.pdf.gz",
"responseFileExtension": "TABLES.json.gz",
"data": [
{
"pageInfo": {
"number": 3,
"rotation": 270,
"width": 595.260009765625,
"height": 841.8599853515625
},
"tableCells": [
{
"x0": 9.89996337890625,
"y0": 82.97998046875,
"x1": 93.41998291015625,
"y1": 390.780029296875,
"width": 83.52001953125,
"height": 307.800048828125
},
{
"x0": 8.4599609375,
"y0": 391.1400146484375,
"x1": 90.89996337890625,
"y1": 497.70001220703125,
"width": 82.44000244140625,
"height": 106.55999755859375
}
]
},
{
"pageInfo": {
"number": 5,
"rotation": 270,
"width": 595.260009765625,
"height": 841.8599853515625
},
"tableCells": [
{
"x0": 517.5,
"y0": 356.94000244140625,
"x1": 558.9000244140625,
"y1": 504.5400085449219,
"width": 41.4000244140625,
"height": 147.60000610351562
},
{
"x0": 517.5,
"y0": 223.02001953125,
"x1": 558.9000244140625,
"y1": 356.22003173828125,
"width": 41.4000244140625,
"height": 133.20001220703125
},
{
"x0": 517.5,
"y0": 74.34002685546875,
"x1": 558.9000244140625,
"y1": 222.300048828125,
"width": 41.4000244140625,
"height": 147.96002197265625
},
{
"x0": 503.4599914550781,
"y0": 357.29998779296875,
"x1": 517.1400146484375,
"y1": 504.5400085449219,
"width": 13.680023193359375,
"height": 147.24002075195312
},
{
"x0": 503.4599914550781,
"y0": 223.02001953125,
"x1": 517.1400146484375,
"y1": 356.22003173828125,
"width": 13.680023193359375,
"height": 133.20001220703125
},
{
"x0": 503.0999755859375,
"y0": 74.34002685546875,
"x1": 517.1400146484375,
"y1": 222.300048828125,
"width": 14.0400390625,
"height": 147.96002197265625
},
{
"x0": 475.3799743652344,
"y0": 357.29998779296875,
"x1": 502.739990234375,
"y1": 504.5400085449219,
"width": 27.360015869140625,
"height": 147.24002075195312
},
{
"x0": 475.3799743652344,
"y0": 223.02001953125,
"x1": 502.739990234375,
"y1": 356.22003173828125,
"width": 27.360015869140625,
"height": 133.20001220703125
},
{
"x0": 475.0199890136719,
"y0": 74.34002685546875,
"x1": 502.739990234375,
"y1": 222.300048828125,
"width": 27.720001220703125,
"height": 147.96002197265625
},
{
"x0": 460.97998046875,
"y0": 357.29998779296875,
"x1": 474.65997314453125,
"y1": 504.5400085449219,
"width": 13.67999267578125,
"height": 147.24002075195312
},
{
"x0": 460.97998046875,
"y0": 223.02001953125,
"x1": 474.65997314453125,
"y1": 356.22003173828125,
"width": 13.67999267578125,
"height": 133.20001220703125
},
{
"x0": 460.97998046875,
"y0": 74.34002685546875,
"x1": 474.65997314453125,
"y1": 222.300048828125,
"width": 13.67999267578125,
"height": 147.96002197265625
},
{
"x0": 10.6199951171875,
"y0": 85.1400146484375,
"x1": 78.29998779296875,
"y1": 198.1800537109375,
"width": 67.67999267578125,
"height": 113.0400390625
},
{
"x0": 9.53997802734375,
"y0": 462.05999755859375,
"x1": 38.70001220703125,
"y1": 499.1400146484375,
"width": 29.1600341796875,
"height": 37.08001708984375
},
{
"x0": -0.17999267578125,
"y0": 85.1400146484375,
"x1": 10.260009765625,
"y1": 499.1400146484375,
"width": 10.44000244140625,
"height": 414.0
}
]
},
{
"pageInfo": {
"number": 6,
"rotation": 270,
"width": 595.260009765625,
"height": 841.8599853515625
},
"tableCells": [
{
"x0": 10.260009765625,
"y0": 340.739990234375,
"x1": 91.97998046875,
"y1": 500.94000244140625,
"width": 81.719970703125,
"height": 160.20001220703125
},
{
"x0": 10.260009765625,
"y0": 115.02001953125,
"x1": 91.97998046875,
"y1": 340.3800048828125,
"width": 81.719970703125,
"height": 225.3599853515625
},
{
"x0": 10.260009765625,
"y0": 86.58001708984375,
"x1": 91.97998046875,
"y1": 114.6600341796875,
"width": 81.719970703125,
"height": 28.08001708984375
},
{
"x0": -0.17999267578125,
"y0": 86.58001708984375,
"x1": 9.89996337890625,
"y1": 500.58001708984375,
"width": 10.0799560546875,
"height": 414.0
}
]
},
{
"pageInfo": {
"number": 15,
"rotation": 0,
"width": 595.3200073242188,
"height": 842.0399780273438
},
"tableCells": [
{
"x0": 97.55999755859375,
"y0": 382.67999267578125,
"x1": 526.6799926757812,
"y1": 394.5599670410156,
"width": 429.1199951171875,
"height": 11.879974365234375
},
{
"x0": 97.55999755859375,
"y0": 370.0799865722656,
"x1": 526.6799926757812,
"y1": 381.9599914550781,
"width": 429.1199951171875,
"height": 11.8800048828125
},
{
"x0": 97.55999755859375,
"y0": 356.39996337890625,
"x1": 202.67999267578125,
"y1": 368.27996826171875,
"width": 105.1199951171875,
"height": 11.8800048828125
},
{
"x0": 204.1199951171875,
"y0": 356.39996337890625,
"x1": 357.1199951171875,
"y1": 368.27996826171875,
"width": 153.0,
"height": 11.8800048828125
},
{
"x0": 358.9200134277344,
"y0": 356.39996337890625,
"x1": 526.6799926757812,
"y1": 368.27996826171875,
"width": 167.75997924804688,
"height": 11.8800048828125
},
{
"x0": 97.55999755859375,
"y0": 343.79998779296875,
"x1": 202.67999267578125,
"y1": 355.67999267578125,
"width": 105.1199951171875,
"height": 11.8800048828125
},
{
"x0": 204.1199951171875,
"y0": 343.79998779296875,
"x1": 357.1199951171875,
"y1": 355.67999267578125,
"width": 153.0,
"height": 11.8800048828125
},
{
"x0": 358.9200134277344,
"y0": 343.79998779296875,
"x1": 526.6799926757812,
"y1": 355.67999267578125,
"width": 167.75997924804688,
"height": 11.8800048828125
},
{
"x0": 97.55999755859375,
"y0": 330.1199645996094,
"x1": 202.67999267578125,
"y1": 341.9999694824219,
"width": 105.1199951171875,
"height": 11.8800048828125
},
{
"x0": 204.1199951171875,
"y0": 330.1199645996094,
"x1": 357.1199951171875,
"y1": 341.9999694824219,
"width": 153.0,
"height": 11.8800048828125
},
{
"x0": 358.9200134277344,
"y0": 330.1199645996094,
"x1": 526.6799926757812,
"y1": 341.9999694824219,
"width": 167.75997924804688,
"height": 11.8800048828125
},
{
"x0": 97.55999755859375,
"y0": 317.51995849609375,
"x1": 202.67999267578125,
"y1": 329.39996337890625,
"width": 105.1199951171875,
"height": 11.8800048828125
},
{
"x0": 204.1199951171875,
"y0": 317.51995849609375,
"x1": 357.1199951171875,
"y1": 329.39996337890625,
"width": 153.0,
"height": 11.8800048828125
},
{
"x0": 358.9200134277344,
"y0": 317.51995849609375,
"x1": 526.6799926757812,
"y1": 329.39996337890625,
"width": 167.75997924804688,
"height": 11.8800048828125
},
{
"x0": 97.55999755859375,
"y0": 303.8399658203125,
"x1": 202.67999267578125,
"y1": 315.719970703125,
"width": 105.1199951171875,
"height": 11.8800048828125
},
{
"x0": 204.1199951171875,
"y0": 303.8399658203125,
"x1": 357.1199951171875,
"y1": 315.719970703125,
"width": 153.0,
"height": 11.8800048828125
},
{
"x0": 358.9200134277344,
"y0": 303.8399658203125,
"x1": 526.6799926757812,
"y1": 315.719970703125,
"width": 167.75997924804688,
"height": 11.8800048828125
},
{
"x0": 97.55999755859375,
"y0": 291.239990234375,
"x1": 202.67999267578125,
"y1": 303.1199951171875,
"width": 105.1199951171875,
"height": 11.8800048828125
},
{
"x0": 204.1199951171875,
"y0": 291.239990234375,
"x1": 357.1199951171875,
"y1": 303.1199951171875,
"width": 153.0,
"height": 11.8800048828125
},
{
"x0": 358.55999755859375,
"y0": 291.239990234375,
"x1": 526.6799926757812,
"y1": 303.1199951171875,
"width": 168.1199951171875,
"height": 11.8800048828125
},
{
"x0": 97.55999755859375,
"y0": 277.55999755859375,
"x1": 202.67999267578125,
"y1": 289.44000244140625,
"width": 105.1199951171875,
"height": 11.8800048828125
},
{
"x0": 204.1199951171875,
"y0": 277.55999755859375,
"x1": 357.1199951171875,
"y1": 289.44000244140625,
"width": 153.0,
"height": 11.8800048828125
},
{
"x0": 358.55999755859375,
"y0": 277.55999755859375,
"x1": 526.6799926757812,
"y1": 289.44000244140625,
"width": 168.1199951171875,
"height": 11.8800048828125
}
]
},
{
"pageInfo": {
"number": 36,
"rotation": 0,
"width": 595.3200073242188,
"height": 842.0399780273438
},
"tableCells": [
{
"x0": 91.80000305175781,
"y0": 601.9199829101562,
"x1": 148.32000732421875,
"y1": 630.3599853515625,
"width": 56.52000427246094,
"height": 28.44000244140625
},
{
"x0": 150.83999633789062,
"y0": 601.9199829101562,
"x1": 244.44000244140625,
"y1": 630.3599853515625,
"width": 93.60000610351562,
"height": 28.44000244140625
},
{
"x0": 246.9600067138672,
"y0": 601.9199829101562,
"x1": 386.2799987792969,
"y1": 630.3599853515625,
"width": 139.3199920654297,
"height": 28.44000244140625
},
{
"x0": 389.5199890136719,
"y0": 601.9199829101562,
"x1": 535.6799926757812,
"y1": 630.3599853515625,
"width": 146.16000366210938,
"height": 28.44000244140625
},
{
"x0": 91.80000305175781,
"y0": 550.7999877929688,
"x1": 148.67999267578125,
"y1": 599.3999633789062,
"width": 56.87998962402344,
"height": 48.5999755859375
},
{
"x0": 150.83999633789062,
"y0": 550.7999877929688,
"x1": 244.44000244140625,
"y1": 599.3999633789062,
"width": 93.60000610351562,
"height": 48.5999755859375
},
{
"x0": 246.9600067138672,
"y0": 550.7999877929688,
"x1": 386.2799987792969,
"y1": 599.3999633789062,
"width": 139.3199920654297,
"height": 48.5999755859375
},
{
"x0": 389.5199890136719,
"y0": 550.7999877929688,
"x1": 535.6799926757812,
"y1": 599.3999633789062,
"width": 146.16000366210938,
"height": 48.5999755859375
},
{
"x0": 91.80000305175781,
"y0": 500.7599792480469,
"x1": 148.32000732421875,
"y1": 548.6400146484375,
"width": 56.52000427246094,
"height": 47.880035400390625
},
{
"x0": 150.83999633789062,
"y0": 500.7599792480469,
"x1": 244.44000244140625,
"y1": 548.6400146484375,
"width": 93.60000610351562,
"height": 47.880035400390625
},
{
"x0": 246.9600067138672,
"y0": 500.7599792480469,
"x1": 386.2799987792969,
"y1": 548.6400146484375,
"width": 139.3199920654297,
"height": 47.880035400390625
},
{
"x0": 389.5199890136719,
"y0": 500.7599792480469,
"x1": 535.6799926757812,
"y1": 548.6400146484375,
"width": 146.16000366210938,
"height": 47.880035400390625
},
{
"x0": 91.80000305175781,
"y0": 449.6399841308594,
"x1": 148.32000732421875,
"y1": 498.239990234375,
"width": 56.52000427246094,
"height": 48.600006103515625
},
{
"x0": 150.83999633789062,
"y0": 449.6399841308594,
"x1": 244.44000244140625,
"y1": 498.239990234375,
"width": 93.60000610351562,
"height": 48.600006103515625
},
{
"x0": 246.9600067138672,
"y0": 449.6399841308594,
"x1": 386.2799987792969,
"y1": 498.239990234375,
"width": 139.3199920654297,
"height": 48.600006103515625
},
{
"x0": 389.5199890136719,
"y0": 449.6399841308594,
"x1": 535.6799926757812,
"y1": 498.239990234375,
"width": 146.16000366210938,
"height": 48.600006103515625
},
{
"x0": 91.80000305175781,
"y0": 399.239990234375,
"x1": 148.32000732421875,
"y1": 447.8399658203125,
"width": 56.52000427246094,
"height": 48.5999755859375
},
{
"x0": 150.83999633789062,
"y0": 399.239990234375,
"x1": 244.44000244140625,
"y1": 447.8399658203125,
"width": 93.60000610351562,
"height": 48.5999755859375
},
{
"x0": 246.9600067138672,
"y0": 399.239990234375,
"x1": 386.2799987792969,
"y1": 447.8399658203125,
"width": 139.3199920654297,
"height": 48.5999755859375
},
{
"x0": 389.5199890136719,
"y0": 399.239990234375,
"x1": 535.6799926757812,
"y1": 447.8399658203125,
"width": 146.16000366210938,
"height": 48.5999755859375
},
{
"x0": 91.80000305175781,
"y0": 348.8399658203125,
"x1": 148.67999267578125,
"y1": 396.719970703125,
"width": 56.87998962402344,
"height": 47.8800048828125
},
{
"x0": 150.83999633789062,
"y0": 348.8399658203125,
"x1": 244.44000244140625,
"y1": 396.719970703125,
"width": 93.60000610351562,
"height": 47.8800048828125
},
{
"x0": 246.9600067138672,
"y0": 348.8399658203125,
"x1": 386.2799987792969,
"y1": 396.719970703125,
"width": 139.3199920654297,
"height": 47.8800048828125
},
{
"x0": 389.5199890136719,
"y0": 348.8399658203125,
"x1": 535.6799926757812,
"y1": 396.719970703125,
"width": 146.16000366210938,
"height": 47.8800048828125
},
{
"x0": 91.80000305175781,
"y0": 298.0799560546875,
"x1": 148.67999267578125,
"y1": 346.67999267578125,
"width": 56.87998962402344,
"height": 48.60003662109375
},
{
"x0": 150.83999633789062,
"y0": 298.0799560546875,
"x1": 244.44000244140625,
"y1": 346.67999267578125,
"width": 93.60000610351562,
"height": 48.60003662109375
},
{
"x0": 246.9600067138672,
"y0": 298.0799560546875,
"x1": 386.2799987792969,
"y1": 346.67999267578125,
"width": 139.3199920654297,
"height": 48.60003662109375
},
{
"x0": 389.5199890136719,
"y0": 298.0799560546875,
"x1": 535.6799926757812,
"y1": 346.67999267578125,
"width": 146.16000366210938,
"height": 48.60003662109375
}
]
},
{
"pageInfo": {
"number": 38,
"rotation": 0,
"width": 595.3200073242188,
"height": 842.0399780273438
},
"tableCells": [
{
"x0": 136.44000244140625,
"y0": 497.15997314453125,
"x1": 213.47999572753906,
"y1": 529.9199829101562,
"width": 77.03999328613281,
"height": 32.760009765625
},
{
"x0": 213.83999633789062,
"y0": 497.15997314453125,
"x1": 266.760009765625,
"y1": 529.9199829101562,
"width": 52.920013427734375,
"height": 32.760009765625
},
{
"x0": 267.4800109863281,
"y0": 497.15997314453125,
"x1": 364.67999267578125,
"y1": 529.9199829101562,
"width": 97.19998168945312,
"height": 32.760009765625
},
{
"x0": 136.44000244140625,
"y0": 455.03997802734375,
"x1": 213.47999572753906,
"y1": 496.79998779296875,
"width": 77.03999328613281,
"height": 41.760009765625
},
{
"x0": 213.83999633789062,
"y0": 475.91998291015625,
"x1": 266.760009765625,
"y1": 496.79998779296875,
"width": 52.920013427734375,
"height": 20.8800048828125
},
{
"x0": 267.4800109863281,
"y0": 475.91998291015625,
"x1": 364.67999267578125,
"y1": 496.79998779296875,
"width": 97.19998168945312,
"height": 20.8800048828125
},
{
"x0": 213.83999633789062,
"y0": 455.03997802734375,
"x1": 266.3999938964844,
"y1": 475.5599670410156,
"width": 52.55999755859375,
"height": 20.519989013671875
},
{
"x0": 267.1199951171875,
"y0": 455.03997802734375,
"x1": 364.67999267578125,
"y1": 475.5599670410156,
"width": 97.55999755859375,
"height": 20.519989013671875
},
{
"x0": 136.44000244140625,
"y0": 412.91998291015625,
"x1": 213.47999572753906,
"y1": 454.3199768066406,
"width": 77.03999328613281,
"height": 41.399993896484375
},
{
"x0": 213.83999633789062,
"y0": 433.79998779296875,
"x1": 266.3999938964844,
"y1": 454.3199768066406,
"width": 52.55999755859375,
"height": 20.519989013671875
},
{
"x0": 267.1199951171875,
"y0": 433.79998779296875,
"x1": 364.67999267578125,
"y1": 454.3199768066406,
"width": 97.55999755859375,
"height": 20.519989013671875
},
{
"x0": 213.83999633789062,
"y0": 412.91998291015625,
"x1": 266.3999938964844,
"y1": 433.4399719238281,
"width": 52.55999755859375,
"height": 20.519989013671875
},
{
"x0": 267.1199951171875,
"y0": 412.91998291015625,
"x1": 364.67999267578125,
"y1": 433.4399719238281,
"width": 97.55999755859375,
"height": 20.519989013671875
},
{
"x0": 136.44000244140625,
"y0": 370.4399719238281,
"x1": 213.47999572753906,
"y1": 412.1999816894531,
"width": 77.03999328613281,
"height": 41.760009765625
},
{
"x0": 213.83999633789062,
"y0": 392.03997802734375,
"x1": 266.3999938964844,
"y1": 412.1999816894531,
"width": 52.55999755859375,
"height": 20.160003662109375
},
{
"x0": 267.1199951171875,
"y0": 392.03997802734375,
"x1": 364.67999267578125,
"y1": 412.1999816894531,
"width": 97.55999755859375,
"height": 20.160003662109375
},
{
"x0": 213.83999633789062,
"y0": 370.79998779296875,
"x1": 266.3999938964844,
"y1": 391.3199768066406,
"width": 52.55999755859375,
"height": 20.519989013671875
},
{
"x0": 267.1199951171875,
"y0": 370.79998779296875,
"x1": 364.67999267578125,
"y1": 391.3199768066406,
"width": 97.55999755859375,
"height": 20.519989013671875
}
]
},
{
"pageInfo": {
"number": 39,
"rotation": 0,
"width": 595.3200073242188,
"height": 842.0399780273438
},
"tableCells": [
{
"x0": 142.55999755859375,
"y0": 581.0399780273438,
"x1": 467.2799987792969,
"y1": 609.8399658203125,
"width": 324.7200012207031,
"height": 28.79998779296875
},
{
"x0": 142.55999755859375,
"y0": 568.7999877929688,
"x1": 467.2799987792969,
"y1": 579.5999755859375,
"width": 324.7200012207031,
"height": 10.79998779296875
}
]
}
]
}