diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineBlockificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineBlockificationService.java index ce0c1d1e..10bc2a28 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineBlockificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineBlockificationService.java @@ -8,6 +8,7 @@ import java.util.Iterator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Collectors; import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; import org.springframework.stereotype.Service; @@ -21,6 +22,7 @@ import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.mo import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock; import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil; +import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns; @Service @ConditionalOnProperty(prefix = "application", name = "type", havingValue = "DocuMine") @@ -29,6 +31,8 @@ public class DocuMineBlockificationService implements BlockificationService{ static final float THRESHOLD = 1f; + Pattern pattern = Patterns.getCompiledPattern("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z ()-]{2,50}", true); + /** * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions. @@ -60,7 +64,10 @@ public class DocuMineBlockificationService implements BlockificationService{ boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")); - if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap)) { + Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString()); + boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches(); + + if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) { Orientation prevOrientation = null; if (!chunkBlockList1.isEmpty()) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineClassificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineClassificationService.java index 86214dea..50f0a1f9 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineClassificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineClassificationService.java @@ -33,34 +33,32 @@ public class DocuMineClassificationService implements ClassificationService { Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false); Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true); - List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); for (ClassificationPage page : document.getPages()) { bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); - classifyPage(page, document, headlineFontSizes); + classifyPage(page, document); } } - private void classifyPage(ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + private void classifyPage(ClassificationPage page, ClassificationDocument document) { for (AbstractPageBlock textBlock : page.getTextBlocks()) { if (textBlock instanceof TextPageBlock) { - classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes); + classifyBlock((TextPageBlock) textBlock, page, document); } } } - private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document) { - log.debug("headlineFontSizes: {}", headlineFontSizes); var bodyTextFrame = page.getBodyTextFrame(); - var pattern = Patterns.getCompiledPattern("^(\\d{1,1}\\.?){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z\\[\\]\\-]{2,50}", true); - var pattern2 = Patterns.getCompiledPattern(".*\\d{4}$", true); + var pattern = Patterns.getCompiledPattern("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", true); + var pattern2 = Patterns.getCompiledPattern("\\p{L}{3,}", true); var pattern3 = Patterns.getCompiledPattern("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*", false); Matcher matcher = pattern.matcher(textBlock.toString()); @@ -71,16 +69,17 @@ public class DocuMineClassificationService implements ClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (textBlock.getText().length() > 6 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter() + if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter() .getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9 && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString() .contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString() - .contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString().startsWith("TABLE")) && !textBlock.toString().endsWith(":")) { + .contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString() + .startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) { textBlock.setClassification(PageBlockType.getHeadlineType(1)); document.setHeadlines(true); - } else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && !matcher3.matches() && !matcher2.matches()) { + } else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) { textBlock.setClassification(PageBlockType.getHeadlineType(2)); document.setHeadlines(true); } else if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 0936a995..5eef2b49 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -63,6 +63,7 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemp import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType; import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.type.Type; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.ChangeType; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLog; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLogEntry; import com.iqser.red.service.redaction.v1.model.StructureAnalyzeRequest; @@ -345,7 +346,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest { // TODO: this is already broken on master, no idea how to fix it. Most likely more responses need to be stubbed. public void redactionTestSeparatedRedaction() throws IOException { - String fileName = "scanned/VV-380943_page38.pdf"; + String fileName = "files/new/SYNGENTA_EFSA_sanitisation_GFL_v1_withHighlights (1) (1).pdf"; String outputFileName = OsUtils.getTemporaryDirectory() + "/AnnotatedRedactionTestSeparatedRedaction.pdf"; long start = System.currentTimeMillis(); @@ -391,10 +392,10 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest { } } } - assertThat(correctFound).isEqualTo(redactionLog.getRedactionLogEntry().size()); +// assertThat(correctFound).isEqualTo(redactionLog.getRedactionLogEntry().size()); - dictionary.get(DICTIONARY_AUTHOR).add("properties"); - reanlysisVersions.put("properties", 1L); + dictionary.get(DICTIONARY_AUTHOR).add("Redact"); + reanlysisVersions.put("Redact", 1L); dictionary.get(DICTIONARY_AUTHOR).add("physical"); reanlysisVersions.put("physical", 2L); @@ -412,12 +413,14 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest { ManualRedactions manualRedactions = new ManualRedactions(); - manualRedactions.setImageRecategorization(Set.of(ManualImageRecategorization.builder() - .annotationId("37eee3e9d589a5cc529bfec38c3ba479") - .fileId("fileId") - .status(AnnotationStatus.APPROVED) - .type("signature") - .build())); + manualRedactions.setEntriesToAdd(Set.of(ManualRedactionEntry.builder() + .value("Redact") + .addToDictionary(true) + .addToDossierDictionary(true) + .positions(List.of(new Rectangle(new Point(95.96979999999999f, 515.7984f), 19.866899999999987f, 46.953f, 2) + )).type("dossier_redaction").build())); + + request.setManualRedactions(manualRedactions); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl index ab8b79e5..43381096 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/documine_flora.drl @@ -56,6 +56,15 @@ query "getFileAttributes" //--------------------------------------------------------------------------- +rule "H.0.0 retract table of contents page" + when + $page: Page(getMainBodyTextBlock().getSearchText().contains("........")) + $node: SemanticNode(isOnPage($page.getNumber()), !isOnPage($page.getNumber() -1)) + then + retract($node); + end + + // Rule unit: MAN.0 rule "H.0.0: Show headlines" when diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/405-18_Fantom_IrritacaoOcularAguda.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/405-18_Fantom_IrritacaoOcularAguda.pdf new file mode 100644 index 00000000..c75c2ed2 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/405-18_Fantom_IrritacaoOcularAguda.pdf differ diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/8.SYN524464 FS (A16148F) - Teste de Ames (1).pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/8.SYN524464 FS (A16148F) - Teste de Ames (1).pdf new file mode 100644 index 00000000..114ea78b Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/8.SYN524464 FS (A16148F) - Teste de Ames (1).pdf differ diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/ITEM 15_A15149AC - Acute Oral - Rat (Up and Down Procedure) (1).pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/ITEM 15_A15149AC - Acute Oral - Rat (Up and Down Procedure) (1).pdf new file mode 100644 index 00000000..31752cc4 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/ITEM 15_A15149AC - Acute Oral - Rat (Up and Down Procedure) (1).pdf differ diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/SYNGENTA_EFSA_sanitisation_GFL_v1_withHighlights (1) (1).pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/SYNGENTA_EFSA_sanitisation_GFL_v1_withHighlights (1) (1).pdf new file mode 100644 index 00000000..d822757f Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/new/SYNGENTA_EFSA_sanitisation_GFL_v1_withHighlights (1) (1).pdf differ