From 77a420c8492c9b1a7d43127ddda1d4cc8eac58d2 Mon Sep 17 00:00:00 2001 From: deiflaender Date: Mon, 19 Jun 2023 11:03:27 +0200 Subject: [PATCH] RED-6009: Sort TextpositionSequences with tolerance --- .../factory/DocumentGraphFactory.java | 2 +- .../graph/entity/RedactionEntity.java | 2 +- .../services/EntityCreationService.java | 2 +- .../utils/TextPositionOperations.java | 19 +++-- .../utils/TextPositionSequenceComparator.java | 74 +++++++++++++++++++ .../v1/server/redaction/utils/IdBuilder.java | 7 +- .../v1/server/RedactionIntegrationTest.java | 2 +- .../resources/dictionaries/CBI_author.txt | 3 +- .../src/test/resources/drools/rules.drl | 11 ++- 9 files changed, 102 insertions(+), 20 deletions(-) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/TextPositionSequenceComparator.java diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/DocumentGraphFactory.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/DocumentGraphFactory.java index d564e83a..1b64dee2 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/DocumentGraphFactory.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/DocumentGraphFactory.java @@ -95,7 +95,7 @@ public class DocumentGraphFactory { Rectangle2D position = image.getPosition(); Page page = context.getPage(image.getPage()); Image imageNode = Image.builder() - .id(IdBuilder.buildId(Set.of(page), List.of(position))) + .id(IdBuilder.buildId(Set.of(page), List.of(position), image.getImageType().name(), "image")) .imageType(image.getImageType()) .position(position) .transparent(image.isHasTransparency()) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/entity/RedactionEntity.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/entity/RedactionEntity.java index fb6ee428..10de154a 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/entity/RedactionEntity.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/entity/RedactionEntity.java @@ -139,7 +139,7 @@ public class RedactionEntity { .min(Comparator.comparingInt(Page::getNumber)) .orElseThrow(() -> new RuntimeException("No Positions found on any page!")); - String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList()); + String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList(), type, entityType.name()); redactionPositionsPerPage = rectanglesPerLinePerPage.entrySet().stream().map(entry -> buildRedactionPosition(firstPage, id, entry)).toList(); } return redactionPositionsPerPage; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/services/EntityCreationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/services/EntityCreationService.java index c37bb0a2..d381ae10 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/services/EntityCreationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/services/EntityCreationService.java @@ -285,7 +285,7 @@ public class EntityCreationService { RedactionEntity highlightEntity = RedactionEntity.initialEntityNode(new Boundary(tableCell.getBoundary().start(), tableCell.getBoundary().start()), type, entityType); - String positionId = IdBuilder.buildId(tableCell.getBBox().keySet(), tableCell.getBBox().values().stream().toList()); + String positionId = IdBuilder.buildId(tableCell.getBBox().keySet(), tableCell.getBBox().values().stream().toList(), type, entityType.name()); highlightEntity.setRedactionPositionsPerPage(tableCell.getBBox() .entrySet() .stream() diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/TextPositionOperations.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/TextPositionOperations.java index 04318811..c2687ea2 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/TextPositionOperations.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/TextPositionOperations.java @@ -1,20 +1,27 @@ package com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils; -import java.util.Comparator; import java.util.List; +import java.util.stream.Collectors; + +import org.apache.pdfbox.util.QuickSort; import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock; import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence; public class TextPositionOperations { + private static final TextPositionSequenceComparator comparator = new TextPositionSequenceComparator(); + + public static List mergeAndSortTextPositionSequenceByYThenX(List textBlocks) { - return textBlocks.stream()// - .flatMap(tb -> tb.getSequences().stream())// - .sorted(Comparator.comparingDouble(TextPositionSequence::getMaxYDirAdj)// - .thenComparing(TextPositionSequence::getMaxXDirAdj))// - .toList(); + var sequence = textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList()); + + // because the TextPositionSequenceComparator is not transitive, but + // JDK7+ enforces transitivity on comparators, we need to use + // a custom quicksort implementation (which is slower, unfortunately). + QuickSort.sort(sequence, comparator); + return sequence; } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/TextPositionSequenceComparator.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/TextPositionSequenceComparator.java new file mode 100644 index 00000000..8e171f5c --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/TextPositionSequenceComparator.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils; + +import java.util.Comparator; + +import org.apache.pdfbox.text.TextPosition; + +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence; + +/** + * This class is a comparator for TextPosition operators. It handles + * pages with text in different directions by grouping the text based + * on direction and sorting in that direction. This allows continuous text + * in a given direction to be more easily grouped together. + * + * @author Ben Litchfield + */ +public class TextPositionSequenceComparator implements Comparator +{ + @Override + public int compare(TextPositionSequence pos1, TextPositionSequence pos2) + { + // only compare text that is in the same direction + int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees()); + if (cmp1 != 0) + { + return cmp1; + } + + // get the text direction adjusted coordinates + float x1 = pos1.getMinXDirAdj(); + float x2 = pos2.getMinXDirAdj(); + + float pos1YBottom = pos1.getMaxYDirAdj(); + float pos2YBottom = pos2.getMaxYDirAdj(); + + // note that the coordinates have been adjusted so 0,0 is in upper left + float pos1YTop = pos1YBottom - pos1.getTextHeight(); + float pos2YTop = pos2YBottom - pos2.getTextHeight(); + + float yDifference = Math.abs(pos1YBottom - pos2YBottom); + + // we will do a simple tolerance comparison + if (yDifference < .1 || + pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || + pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) + { + return Float.compare(x1, x2); + } + else if (pos1YBottom < pos2YBottom) + { + return -1; + } + else + { + return 1; + } + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java index bb9a87d8..d1a3da42 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/IdBuilder.java @@ -19,15 +19,16 @@ public final class IdBuilder { private final HashFunction hashFunction = Hashing.murmur3_128(); - public String buildId(Set pages, List rectanglesPerLine) { + public String buildId(Set pages, List rectanglesPerLine, String type, String entityType) { - return buildId(pages.stream().map(Page::getNumber).collect(Collectors.toList()), rectanglesPerLine); + return buildId(pages.stream().map(Page::getNumber).collect(Collectors.toList()), rectanglesPerLine, type, entityType); } - public String buildId(List pageNumbers, List rectanglesPerLine) { + public String buildId(List pageNumbers, List rectanglesPerLine, String type, String entityType) { StringBuilder sb = new StringBuilder(); + sb.append(type).append(entityType); List sortedPageNumbers = pageNumbers.stream().sorted(Comparator.comparingInt(Integer::intValue)).toList(); sortedPageNumbers.forEach(sb::append); rectanglesPerLine.forEach(rectangle2D -> sb.append(Math.round(rectangle2D.getX())) diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 65112ab7..9d388fb6 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -222,7 +222,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest { @Test public void titleExtraction() throws IOException { - AnalyzeRequest request = uploadFileToStorage("files/new/S157.pdf"); + AnalyzeRequest request = uploadFileToStorage("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"); System.out.println("Start Full integration test"); analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId())); System.out.println("Finished structure analysis"); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt index 344db2ca..9b21c07e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_author.txt @@ -8622,4 +8622,5 @@ F. Lastname Mustermann Lastname Bojangles -Tambourine Man \ No newline at end of file +Tambourine Man +Tournayre J.C. \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index 74cb4267..0b3359b9 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -58,8 +58,8 @@ rule "SYN.0.0: Redact if CTL/* or BL/* was found (Non Vertebrate Study)" $section: Section(containsString("CTL/") || containsString("BL/")) then Stream.concat( - entityCreationService.byString("CTL", "hint", EntityType.ENTITY, $section), - entityCreationService.byString("BL", "hint", EntityType.ENTITY, $section) + entityCreationService.byString("CTL", "must_redact", EntityType.ENTITY, $section), + entityCreationService.byString("BL", "must_redact", EntityType.ENTITY, $section) ).forEach(entity -> { entity.setRedactionReason("hint_only"); entity.addMatchedRule(0); @@ -1105,8 +1105,7 @@ rule "MAN.1.0: Apply id removals that are valid and not in forced redactions to not ManualForceRedaction($id == annotationId, status == AnnotationStatus.APPROVED, requestDate != null) $entityToBeRemoved: RedactionEntity(matchesAnnotationId($id)) then - $entityToBeRemoved.removeFromGraph(); - retract($entityToBeRemoved); + $entityToBeRemoved.setIgnored(true); end rule "MAN.1.1: Apply id removals that are valid and not in forced redactions to Image" @@ -1224,10 +1223,10 @@ rule "X.5.0: remove Entity of type RECOMMENDATION when contained by ENTITY" // Rule unit: X.6 -rule "X.6.0: remove Entity of lower rank, when intersects" +rule "X.6.0: remove Entity of lower rank, when intersected by entity of type ENTITY" salience 32 when - $higherRank: RedactionEntity($type: type) + $higherRank: RedactionEntity($type: type, entityType == EntityType.ENTITY) $lowerRank: RedactionEntity(intersects($higherRank), type != $type, dictionary.getDictionaryRank(type) < dictionary.getDictionaryRank($type), !resized, !skipRemoveEntitiesContainedInLarger) then $lowerRank.removeFromGraph();