RED-6009: Sort TextPositionSequences with tolerance

This commit is contained in:
deiflaender 2023-06-19 11:03:27 +02:00
parent d0264248fb
commit 77a420c849
9 changed files with 102 additions and 20 deletions

View File

@ -95,7 +95,7 @@ public class DocumentGraphFactory {
Rectangle2D position = image.getPosition();
Page page = context.getPage(image.getPage());
Image imageNode = Image.builder()
.id(IdBuilder.buildId(Set.of(page), List.of(position)))
.id(IdBuilder.buildId(Set.of(page), List.of(position), image.getImageType().name(), "image"))
.imageType(image.getImageType())
.position(position)
.transparent(image.isHasTransparency())

View File

@ -139,7 +139,7 @@ public class RedactionEntity {
.min(Comparator.comparingInt(Page::getNumber))
.orElseThrow(() -> new RuntimeException("No Positions found on any page!"));
String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList());
String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList(), type, entityType.name());
redactionPositionsPerPage = rectanglesPerLinePerPage.entrySet().stream().map(entry -> buildRedactionPosition(firstPage, id, entry)).toList();
}
return redactionPositionsPerPage;

View File

@ -285,7 +285,7 @@ public class EntityCreationService {
RedactionEntity highlightEntity = RedactionEntity.initialEntityNode(new Boundary(tableCell.getBoundary().start(), tableCell.getBoundary().start()), type, entityType);
String positionId = IdBuilder.buildId(tableCell.getBBox().keySet(), tableCell.getBBox().values().stream().toList());
String positionId = IdBuilder.buildId(tableCell.getBBox().keySet(), tableCell.getBBox().values().stream().toList(), type, entityType.name());
highlightEntity.setRedactionPositionsPerPage(tableCell.getBBox()
.entrySet()
.stream()

View File

@ -1,20 +1,27 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.pdfbox.util.QuickSort;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
/**
 * Helper for flattening the text position sequences of several text blocks
 * into one ordered list.
 */
public class TextPositionOperations {
// Single comparator instance that is reused for every sort performed by this class.
private static final TextPositionSequenceComparator comparator = new TextPositionSequenceComparator();
/**
 * Collects the sequences of all given text blocks into one list and returns
 * that list in sorted order.
 *
 * @param textBlocks the blocks whose sequences are merged
 * @return the merged and sorted sequences
 */
public static List<TextPositionSequence> mergeAndSortTextPositionSequenceByYThenX(List<TextPageBlock> textBlocks) {
// NOTE(review): the statements below look like two alternative bodies shown back to back:
// first a stream-based sort by max Y, then max X, and after it a QuickSort-based sort using
// the tolerance comparator. Read as literal Java, everything after the first return
// statement is unreachable - confirm which body is the live one.
return textBlocks.stream()//
.flatMap(tb -> tb.getSequences().stream())//
.sorted(Comparator.comparingDouble(TextPositionSequence::getMaxYDirAdj)//
.thenComparing(TextPositionSequence::getMaxXDirAdj))//
.toList();
// Flatten the sequences of every block into one list that is then sorted in place.
var sequence = textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList());
// because the TextPositionSequenceComparator is not transitive, but
// JDK7+ enforces transitivity on comparators, we need to use
// a custom quicksort implementation (which is slower, unfortunately).
QuickSort.sort(sequence, comparator);
return sequence;
}
}

View File

@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils;
import java.util.Comparator;
import org.apache.pdfbox.text.TextPosition;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
/**
 * Comparator for {@link TextPositionSequence}s. Sequences are grouped by text
 * direction first; within one direction they are ordered by their bottom
 * edge, and sequences that are judged to share a line are ordered by
 * ascending minimum X.
 * <p>
 * Two sequences are judged to share a line when their bottom edges are less
 * than 0.1 apart, or when the bottom edge of one lies within the vertical
 * extent (top to bottom) of the other. Because of this tolerance the
 * resulting order is not guaranteed to be transitive.
 *
 * @author Ben Litchfield
 */
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence>
{
    /**
     * Compares two sequences by direction, then by line, then by horizontal position.
     *
     * @param pos1 the first sequence
     * @param pos2 the second sequence
     * @return a negative value, zero or a positive value if {@code pos1} sorts
     *         before, together with, or after {@code pos2}
     */
    @Override
    public int compare(TextPositionSequence pos1, TextPositionSequence pos2)
    {
        // sequences running in different directions are never interleaved
        int directionOrder = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
        if (directionOrder != 0)
        {
            return directionOrder;
        }

        // direction-adjusted coordinates
        float left1 = pos1.getMinXDirAdj();
        float left2 = pos2.getMinXDirAdj();
        float bottom1 = pos1.getMaxYDirAdj();
        float bottom2 = pos2.getMaxYDirAdj();

        // the coordinates are adjusted so that 0,0 is the upper left corner,
        // hence the top edge is the bottom edge minus the text height
        float top1 = bottom1 - pos1.getTextHeight();
        float top2 = bottom2 - pos2.getTextHeight();

        // simple tolerance checks that decide whether both sequences share a line
        boolean bottomsCoincide = Math.abs(bottom1 - bottom2) < .1;
        boolean secondBottomInsideFirst = bottom2 >= top1 && bottom2 <= bottom1;
        boolean firstBottomInsideSecond = bottom1 >= top2 && bottom1 <= bottom2;

        if (!(bottomsCoincide || secondBottomInsideFirst || firstBottomInsideSecond))
        {
            // different lines: the sequence with the smaller bottom Y sorts first
            return bottom1 < bottom2 ? -1 : 1;
        }

        // same line: order by horizontal start position
        return Float.compare(left1, left2);
    }
}

View File

@ -19,15 +19,16 @@ public final class IdBuilder {
private final HashFunction hashFunction = Hashing.murmur3_128();
public String buildId(Set<Page> pages, List<Rectangle2D> rectanglesPerLine) {
public String buildId(Set<Page> pages, List<Rectangle2D> rectanglesPerLine, String type, String entityType) {
return buildId(pages.stream().map(Page::getNumber).collect(Collectors.toList()), rectanglesPerLine);
return buildId(pages.stream().map(Page::getNumber).collect(Collectors.toList()), rectanglesPerLine, type, entityType);
}
public String buildId(List<Integer> pageNumbers, List<Rectangle2D> rectanglesPerLine) {
public String buildId(List<Integer> pageNumbers, List<Rectangle2D> rectanglesPerLine, String type, String entityType) {
StringBuilder sb = new StringBuilder();
sb.append(type).append(entityType);
List<Integer> sortedPageNumbers = pageNumbers.stream().sorted(Comparator.comparingInt(Integer::intValue)).toList();
sortedPageNumbers.forEach(sb::append);
rectanglesPerLine.forEach(rectangle2D -> sb.append(Math.round(rectangle2D.getX()))

View File

@ -222,7 +222,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
@Test
public void titleExtraction() throws IOException {
AnalyzeRequest request = uploadFileToStorage("files/new/S157.pdf");
AnalyzeRequest request = uploadFileToStorage("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
System.out.println("Start Full integration test");
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
System.out.println("Finished structure analysis");

View File

@ -8622,4 +8622,5 @@ F. Lastname
Mustermann
Lastname
Bojangles
Tambourine Man
Tambourine Man
Tournayre J.C.

View File

@ -58,8 +58,8 @@ rule "SYN.0.0: Redact if CTL/* or BL/* was found (Non Vertebrate Study)"
$section: Section(containsString("CTL/") || containsString("BL/"))
then
Stream.concat(
entityCreationService.byString("CTL", "hint", EntityType.ENTITY, $section),
entityCreationService.byString("BL", "hint", EntityType.ENTITY, $section)
entityCreationService.byString("CTL", "must_redact", EntityType.ENTITY, $section),
entityCreationService.byString("BL", "must_redact", EntityType.ENTITY, $section)
).forEach(entity -> {
entity.setRedactionReason("hint_only");
entity.addMatchedRule(0);
@ -1105,8 +1105,7 @@ rule "MAN.1.0: Apply id removals that are valid and not in forced redactions to
not ManualForceRedaction($id == annotationId, status == AnnotationStatus.APPROVED, requestDate != null)
$entityToBeRemoved: RedactionEntity(matchesAnnotationId($id))
then
$entityToBeRemoved.removeFromGraph();
retract($entityToBeRemoved);
$entityToBeRemoved.setIgnored(true);
end
rule "MAN.1.1: Apply id removals that are valid and not in forced redactions to Image"
@ -1224,10 +1223,10 @@ rule "X.5.0: remove Entity of type RECOMMENDATION when contained by ENTITY"
// Rule unit: X.6
rule "X.6.0: remove Entity of lower rank, when intersects"
rule "X.6.0: remove Entity of lower rank, when intersected by entity of type ENTITY"
salience 32
when
$higherRank: RedactionEntity($type: type)
$higherRank: RedactionEntity($type: type, entityType == EntityType.ENTITY)
$lowerRank: RedactionEntity(intersects($higherRank), type != $type, dictionary.getDictionaryRank(type) < dictionary.getDictionaryRank($type), !resized, !skipRemoveEntitiesContainedInLarger)
then
$lowerRank.removeFromGraph();