Merge branch 'RED-8701' into 'main'
RED-8701 - Move files to customer data repositories. See merge request fforesight/layout-parser!137
This commit is contained in:
commit
84bdb4d1ed
@ -1,3 +1,7 @@
|
||||
variables:
|
||||
# SONAR_PROJECT_KEY: 'fforesight_layout-parser_AYd5quv2mRkBOCG22hvF'
|
||||
GIT_SUBMODULE_STRATEGY: recursive
|
||||
GIT_SUBMODULE_FORCE_HTTPS: 'true'
|
||||
include:
|
||||
- project: 'gitlab/gitlab'
|
||||
ref: 'main'
|
||||
|
||||
8
.gitmodules
vendored
Normal file
8
.gitmodules
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/basf"]
|
||||
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/basf
|
||||
url = https://gitlab.knecon.com/fforesight/documents/basf.git
|
||||
update = merge
|
||||
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta"]
|
||||
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta
|
||||
url = https://gitlab.knecon.com/fforesight/documents/syngenta.git
|
||||
update = merge
|
||||
@ -69,11 +69,11 @@ public class HeadlinesGoldStandardIntegrationTest {
|
||||
public void testHeadlineDetection() {
|
||||
|
||||
List<Metrics> metrics = new ArrayList<>();
|
||||
metrics.add(getMetrics("files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf",
|
||||
metrics.add(getMetrics("files/syngenta/CustomerFiles/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf",
|
||||
"files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json"));
|
||||
metrics.add(getMetrics("files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf",
|
||||
metrics.add(getMetrics("files/syngenta/CustomerFiles/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf",
|
||||
"files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json"));
|
||||
metrics.add(getMetrics("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf", "files/headlineTest/S-Metolachlor_RAR_01_Volume_1_2018-09-06_REDACTION_LOG.json"));
|
||||
metrics.add(getMetrics("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf", "files/headlineTest/S-Metolachlor_RAR_01_Volume_1_2018-09-06_REDACTION_LOG.json"));
|
||||
|
||||
double precision = metrics.stream().mapToDouble(Metrics::getPrecision).average().orElse(1.0);
|
||||
double recall = metrics.stream().mapToDouble(Metrics::getRecall).average().orElse(1.0);
|
||||
|
||||
@ -37,7 +37,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testLayoutParserEndToEnd_RED_8747() {
|
||||
|
||||
prepareStorage("files/SinglePages/MergedEntities.pdf");
|
||||
prepareStorage("files/syngenta/CustomerFiles/SinglePages/Page26_fRR A23317A PI0015600 CEU core part B6 - CZ.pdf");
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
|
||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
Arrays.stream(finishedEvent.message().split("\n"))
|
||||
|
||||
@ -48,7 +48,7 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
|
||||
@Disabled
|
||||
public void writeJsonForFileTest() {
|
||||
|
||||
var resource = new ClassPathResource("files/1 Abamectin_prr.pdf");
|
||||
var resource = new ClassPathResource("files/syngenta/CustomerFiles/1 Abamectin_prr.pdf");
|
||||
writeJsons(resource.getFile().toPath());
|
||||
}
|
||||
|
||||
|
||||
@ -27,7 +27,7 @@ public class DocumentGraphMappingTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testGraphMapping() {
|
||||
|
||||
String filename = "files/new/SYNGENTA_EFSA_sanitisation_GFL_v1_moreSections.pdf";
|
||||
String filename = "files/syngenta/CustomerFiles/SYNGENTA_EFSA_sanitisation_GFL_v1_moreSections.pdf";
|
||||
|
||||
Document document = buildGraph(filename);
|
||||
DocumentData documentData = DocumentDataMapper.toDocumentData(document);
|
||||
|
||||
@ -28,7 +28,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentTest {
|
||||
// @Disabled
|
||||
public void visualizeMetolachlor() {
|
||||
|
||||
String filename = "files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf";
|
||||
String filename = "files/syngenta/CustomerFiles/Documine/Flora/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
|
||||
visualizePdf(filename);
|
||||
}
|
||||
|
||||
@ -48,7 +48,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentTest {
|
||||
@Disabled
|
||||
public void visualizeCraftedDocument() {
|
||||
|
||||
String filename = "files/1 Abamectin_prr.pdf";
|
||||
String filename = "files/syngenta/CustomerFiles/1 Abamectin_prr.pdf";
|
||||
visualizePdf(filename);
|
||||
}
|
||||
|
||||
|
||||
@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
@ -81,7 +81,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void tablesToHtmlDebugger() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T5 VV-640252-Page16.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/T5_Page16_VV-640252.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -149,7 +149,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testPDFSegmentationWithComplexTable() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
@ -163,61 +163,130 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testTableExtraction() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/syngenta/CustomerFiles/SinglePages/Merge Table - Page5_26 A8637C - EU AIR3 - LCP Section 10 - Ecotoxicological studies on the plant protection product - Reference list.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(2);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testMultiPageMetadataPropagation() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/syngenta/CustomerFiles/SinglePages/Merge Multi Page Table - Page4_Page5_51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(9);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1).stream().map(Cell::getHeaderCells).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(firstTable.getRowCount() - 1)
|
||||
.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testHeaderCellsForRotatedTable() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/syngenta/CustomerFiles/SinglePages/Rotated Table Headers - Page4_65 Mesotrione - EU AIR3 - LCA Section 1 Supplement Reference List.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc56Page170() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Page170_56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -251,7 +320,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testVV931175Page1() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/VV-931175_Page1.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Page1_VV-931175.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -292,7 +361,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testDoc27Page6() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/27 A8637C - EU AIR3 - MCP Section 1 - Identity of the plant protection product_Page6.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/syngenta/CustomerFiles/SinglePages/Page6_27 A8637C - EU AIR3 - MCP Section 1 - Identity of the plant protection product.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -312,7 +382,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Disabled // FIXME Fake Redactions leads to more cells, no solution for this currently
|
||||
public void testDocA20622APartB9Page185() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Page185_A20622A izRMS (CZ) fRR Part B9.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -325,7 +395,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testDocA20622APartB9Page185FixedDoc() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185_fixed.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Page185_fixed_A20622A izRMS (CZ) fRR Part B9.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -338,7 +408,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testDocA20622APartB7Page123() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izZRMS (CZ) fRR Part B7_Page123.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Page123_A20622A izZRMS (CZ) fRR Part B7.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -357,7 +427,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testDoc77Page111() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/77 Pirimicarb_RAR_08_Volume_3CA_B-6_2017-12-04_Page11.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Page11_77 Pirimicarb_RAR_08_Volume_3CA_B-6_2017-12-04.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -373,7 +443,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testDoc95Page532() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/95 Trinexapac-ethyl_RAR_08_Volume_3CA_B-6_2018-01-10_Page532.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Page532_95 Trinexapac-ethyl_RAR_08_Volume_3CA_B-6_2018-01-10.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -386,7 +456,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testDoc52Page175() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page175.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Page175_52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -400,7 +470,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testDoc52Page174() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page174.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Page174_52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -413,7 +483,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testDoc19Page35() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page35.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/syngenta/CustomerFiles/SinglePages/Page35_19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -426,7 +497,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testDoc19Page161() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page161.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/syngenta/CustomerFiles/SinglePages/Page161_19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -441,7 +513,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
public void testDoc47Page30() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/SinglePages/47 Cyprodinil - EU AIR3 - MCA Section 5 Supplement - Toxicological and metabolism studies on the active substance_Page30.pdf");
|
||||
"files/syngenta/CustomerFiles/SinglePages/Page30_47 Cyprodinil - EU AIR3 - MCA Section 5 Supplement - Toxicological and metabolism studies on the active substance.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -457,7 +529,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
public void testDoc49Page61() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/SinglePages/49 Cyprodinil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance_Page61.pdf");
|
||||
"files/syngenta/CustomerFiles/SinglePages/Page61_49 Cyprodinil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -472,7 +544,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testDoc81Page54() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/81 Pirimicarb_RAR_20_Volume_3CP_A10788A (_Pirimor_)_B-9_2017-12-04_Page54.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/syngenta/CustomerFiles/SinglePages/Page54_81 Pirimicarb_RAR_20_Volume_3CP_A10788A (_Pirimor_)_B-9_2017-12-04.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -487,7 +560,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testDoc88Page134() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/85 Pydiflumetofen_DAR_08_Volume_3CA_B-6_2017-07-26_Page134.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Page134_85 Pydiflumetofen_DAR_08_Volume_3CA_B-6_2017-07-26.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -502,7 +575,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testDocThiabendazolePage18() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/Thiabendazole DAR Addendum for ED_April_2020_Page18.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Page18_Thiabendazole DAR Addendum for ED_April_2020.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -519,7 +592,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testDoc15Page18() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/15 - Pretilachlor - Acute Oral Toxicity (Up and Down Procedure) - Rat_Page18.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/syngenta/CustomerFiles/SinglePages/Page18_15 - Pretilachlor - Acute Oral Toxicity (Up and Down Procedure) - Rat.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -534,7 +608,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
public void testDoc28Page23() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(
|
||||
"files/SinglePages/28 A8637C - EU AIR3 - MCP Section 10 - Ecotoxicological studies on the plant protection product_Page23.pdf");
|
||||
"files/syngenta/CustomerFiles/SinglePages/Page23_28 A8637C - EU AIR3 - MCP Section 10 - Ecotoxicological studies on the plant protection product.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -549,7 +623,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testDoc24Page17() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/24 - SYN549522 - Acute Oral Toxicity - Rats_Page17.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Page17_24 - SYN549522 - Acute Oral Toxicity - Rats.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -563,7 +637,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testDoc30Page5() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/30 - Dicamba - Acute Oral Toxicity - Rats_Page5.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/Page5_30 - Dicamba - Acute Oral Toxicity - Rats.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -626,7 +700,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testT3() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T3 S-Meto_Page29.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/T3_Page29_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -640,7 +714,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testT4() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T4 138 IDD0000261736_Page16.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/T4_Page16_138 IDD0000261736.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -654,7 +728,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testT5() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T5 VV-640252-Page16.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/T5_Page16_VV-640252.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
@ -670,7 +744,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void testMergedEntities_Page26() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/MergedEntities.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Page26_fRR A23317A PI0015600 CEU core part B6 - CZ.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
|
||||
@ -26,7 +26,7 @@ class GapAcrossLinesDetectionServiceTest {
|
||||
@SneakyThrows
|
||||
public void testGapBasedColumnDetection() {
|
||||
|
||||
String filename = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||
String filename = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
@ -52,7 +52,7 @@ class GapAcrossLinesDetectionServiceTest {
|
||||
@SneakyThrows
|
||||
public void testColumnDetection() {
|
||||
|
||||
String filename = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||
String filename = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
@ -26,7 +26,7 @@ class InvisibleTableDetectionServiceTest {
|
||||
@SneakyThrows
|
||||
public void detectInvisibleTableTest() {
|
||||
|
||||
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||
String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
|
||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
|
||||
List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
|
||||
|
||||
|
||||
@ -18,7 +18,7 @@ class MainBodyTextFrameExtractionServiceTest {
|
||||
@SneakyThrows
|
||||
public void testMainBodyDetection() {
|
||||
|
||||
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||
String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
|
||||
String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString();
|
||||
List<PageContents> sortedTextPositionSequence = PageContentExtractor.getSortedPageContents(fileName);
|
||||
|
||||
|
||||
@ -20,7 +20,7 @@ class PageContentExtractorTest {
|
||||
@SneakyThrows
|
||||
public void testTextPositionSequenceExtraction() {
|
||||
|
||||
String fileName = "files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf";
|
||||
String fileName = "files/syngenta/CustomerFiles/Documine/Flora/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
|
||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();
|
||||
|
||||
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
|
||||
|
||||
@ -20,7 +20,7 @@ class PageInformationServiceTest {
|
||||
@SneakyThrows
|
||||
public void testGapDetection() {
|
||||
|
||||
String filename = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||
String filename = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
|
||||
// Derive the temp name from the last path segment rather than a fixed index:
// split("/")[2] yields the directory "CustomerFiles" for the relocated four-segment path.
var tmpFileName = "/tmp/" + filename.substring(filename.lastIndexOf('/') + 1) + "_GAPS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
@ -43,7 +43,7 @@ class PageInformationServiceTest {
|
||||
@SneakyThrows
|
||||
public void testLineDetection() {
|
||||
|
||||
String filename = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||
String filename = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
|
||||
// Derive the temp name from the last path segment rather than a fixed index:
// split("/")[2] yields the directory "CustomerFiles" for the relocated four-segment path.
// NOTE(review): the "_GAPS.pdf" suffix is the same one testGapDetection uses for the same input,
// so both tests target the same /tmp file — confirm whether a distinct suffix was intended.
var tmpFileName = "/tmp/" + filename.substring(filename.lastIndexOf('/') + 1) + "_GAPS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
@ -46,7 +46,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void textRectanglesFromRulingsExtraction() {
|
||||
|
||||
String fileName = "files/SinglePages/T5 VV-640252-Page16.pdf";
|
||||
String fileName = "files/syngenta/CustomerFiles/SinglePages/T5_Page16_VV-640252.pdf";
|
||||
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_CELLS.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
@ -66,7 +66,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void textRulingExtraction() {
|
||||
|
||||
String fileName = "files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page35.pdf";
|
||||
String fileName = "files/syngenta/CustomerFiles/SinglePages/Page35_19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017.pdf";
|
||||
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
|
||||
@ -46,7 +46,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
|
||||
|
||||
if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
|
||||
if (filename.equals("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
|
||||
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json");
|
||||
} else {
|
||||
prepareStorage(filename);
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1 @@
|
||||
Subproject commit 9dc6c2337dea32e63aef53271dba0692537c6605
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user