RED-5276: Fixed strange behavior of text parsing for tables example document

This commit is contained in:
deiflaender 2023-01-31 10:59:46 +01:00
parent c16b6d41d5
commit 16b04b5918
6 changed files with 12 additions and 3 deletions

View File

@ -22,6 +22,7 @@ import org.apache.pdfbox.text.TextPosition;
import java.awt.geom.Point2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
@Slf4j
@ -200,6 +201,8 @@ public class PDFLinesTextStripper extends PDFTextStripper {
int startIndex = 0;
RedTextPosition previous = null;
textPositions.sort(Comparator.comparing(TextPosition::getXDirAdj));
for (int i = 0; i <= textPositions.size() - 1; i++) {
if (!textPositionSequences.isEmpty()) {

View File

@ -13,7 +13,6 @@ public class FileUtils {
public File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException {
System.out.println(filenamePrefix + " " + filenameSuffix);
File tempFile = Files.createTempFile(filenamePrefix, filenameSuffix).toFile();
setRWPermissionsOnlyForOwner(tempFile);

View File

@ -364,7 +364,7 @@ public class RedactionIntegrationTest {
@Test
public void titleExtraction() throws IOException {
AnalyzeRequest request = prepareStorage("files/RSS/06 - Isopyrazam - Acute Oral Toxicity Rat.pdf");
AnalyzeRequest request = prepareStorage("files/new/table-with-merged-cells.pdf");
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
@ -1174,7 +1174,7 @@ public class RedactionIntegrationTest {
public void htmlTablesTest() throws IOException {
System.out.println("htmlTablesTest");
AnalyzeRequest request = prepareStorage("files/Minimal Examples/Single Table.pdf");
AnalyzeRequest request = prepareStorage("files/new/table-with-merged-cells.pdf");
RedactionRequest redactionRequest = RedactionRequest.builder()
.dossierId(request.getDossierId())

View File

@ -397,4 +397,11 @@ rule "102: Guidelines FileAttributes"
Section((text.contains("DATA REQUIREMENT(S):") || text.contains("TEST GUIDELINE(S):")) && (text.contains("OECD") || text.contains("EPA") || text.contains("OPPTS")))
then
section.addFileAttribute("OECD Number", "OECD (No\\.? )?\\d{3}( \\(\\d{4}\\))?", false, 0);
end
// Rule 8: fires for every Section that reports a table header named "h5.1"
// and asks the section to redact the cells under that header as "CBI_author",
// recording "Author found" and the Regulation (EC) No 178/2002 reference.
// NOTE(review): the rule name speaks of an "Author" header, but the header
// actually matched is "h5.1" — presumably specific to a test document; confirm.
rule "8: Redact Author cells in Tables with Author header (Non vertebrate study)"
when
Section(hasTableHeader("h5.1"))
then
// NOTE(review): the numeric argument 8 presumably is the rule number (it matches
// the number in the rule name) and `false` an option flag — verify both against
// the redactCell signature, which is not visible here.
section.redactCell("h5.1", 8, "CBI_author", false, "Author found", "Article 39(e)(3) of Regulation (EC) No 178/2002");
end