RED-6009: Sort TextPositionSequences with tolerance
This commit is contained in:
parent
d0264248fb
commit
77a420c849
@ -95,7 +95,7 @@ public class DocumentGraphFactory {
|
||||
Rectangle2D position = image.getPosition();
|
||||
Page page = context.getPage(image.getPage());
|
||||
Image imageNode = Image.builder()
|
||||
.id(IdBuilder.buildId(Set.of(page), List.of(position)))
|
||||
.id(IdBuilder.buildId(Set.of(page), List.of(position), image.getImageType().name(), "image"))
|
||||
.imageType(image.getImageType())
|
||||
.position(position)
|
||||
.transparent(image.isHasTransparency())
|
||||
|
||||
@ -139,7 +139,7 @@ public class RedactionEntity {
|
||||
.min(Comparator.comparingInt(Page::getNumber))
|
||||
.orElseThrow(() -> new RuntimeException("No Positions found on any page!"));
|
||||
|
||||
String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList());
|
||||
String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList(), type, entityType.name());
|
||||
redactionPositionsPerPage = rectanglesPerLinePerPage.entrySet().stream().map(entry -> buildRedactionPosition(firstPage, id, entry)).toList();
|
||||
}
|
||||
return redactionPositionsPerPage;
|
||||
|
||||
@ -285,7 +285,7 @@ public class EntityCreationService {
|
||||
|
||||
RedactionEntity highlightEntity = RedactionEntity.initialEntityNode(new Boundary(tableCell.getBoundary().start(), tableCell.getBoundary().start()), type, entityType);
|
||||
|
||||
String positionId = IdBuilder.buildId(tableCell.getBBox().keySet(), tableCell.getBBox().values().stream().toList());
|
||||
String positionId = IdBuilder.buildId(tableCell.getBBox().keySet(), tableCell.getBBox().values().stream().toList(), type, entityType.name());
|
||||
highlightEntity.setRedactionPositionsPerPage(tableCell.getBBox()
|
||||
.entrySet()
|
||||
.stream()
|
||||
|
||||
@ -1,20 +1,27 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.util.QuickSort;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||
|
||||
/**
 * Helper operations on the {@link TextPositionSequence}s contained in {@link TextPageBlock}s.
 */
public class TextPositionOperations {
|
||||
|
||||
// Comparator instance handed to QuickSort.sort in the method below.
private static final TextPositionSequenceComparator comparator = new TextPositionSequenceComparator();
|
||||
|
||||
|
||||
/**
 * Flattens the sequences of all given text blocks into a single list and returns them sorted.
 *
 * NOTE(review): this diff rendering carries no +/- markers. The stream pipeline that sorts
 * strictly by getMaxYDirAdj and then getMaxXDirAdj appears to be the removed implementation,
 * and the QuickSort-based code that follows it the added one — confirm against the commit.
 *
 * @param textBlocks blocks whose sequences are merged
 * @return the sequences of all given blocks in sorted order
 */
public static List<TextPositionSequence> mergeAndSortTextPositionSequenceByYThenX(List<TextPageBlock> textBlocks) {
|
||||
|
||||
return textBlocks.stream()//
|
||||
.flatMap(tb -> tb.getSequences().stream())//
|
||||
.sorted(Comparator.comparingDouble(TextPositionSequence::getMaxYDirAdj)//
|
||||
.thenComparing(TextPositionSequence::getMaxXDirAdj))//
|
||||
.toList();
|
||||
// Collected via Collectors.toList() instead of Stream.toList(); presumably because
// QuickSort.sort reorders the list in place and needs a modifiable list — TODO confirm.
var sequence = textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList());
|
||||
|
||||
// because the TextPositionSequenceComparator is not transitive, but
|
||||
// JDK7+ enforces transitivity on comparators, we need to use
|
||||
// a custom quicksort implementation (which is slower, unfortunately).
|
||||
// (The JDK's List.sort/Collections.sort may throw IllegalArgumentException when a
// comparator is found to violate its contract.)
QuickSort.sort(sequence, comparator);
|
||||
return sequence;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,74 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||
|
||||
/**
|
||||
* This class is a comparator for TextPosition operators. It handles
|
||||
* pages with text in different directions by grouping the text based
|
||||
* on direction and sorting in that direction. This allows continuous text
|
||||
* in a given direction to be more easily grouped together.
|
||||
*
|
||||
* @author Ben Litchfield
|
||||
*/
|
||||
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence>
|
||||
{
|
||||
@Override
|
||||
public int compare(TextPositionSequence pos1, TextPositionSequence pos2)
|
||||
{
|
||||
// only compare text that is in the same direction
|
||||
int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
|
||||
if (cmp1 != 0)
|
||||
{
|
||||
return cmp1;
|
||||
}
|
||||
|
||||
// get the text direction adjusted coordinates
|
||||
float x1 = pos1.getMinXDirAdj();
|
||||
float x2 = pos2.getMinXDirAdj();
|
||||
|
||||
float pos1YBottom = pos1.getMaxYDirAdj();
|
||||
float pos2YBottom = pos2.getMaxYDirAdj();
|
||||
|
||||
// note that the coordinates have been adjusted so 0,0 is in upper left
|
||||
float pos1YTop = pos1YBottom - pos1.getTextHeight();
|
||||
float pos2YTop = pos2YBottom - pos2.getTextHeight();
|
||||
|
||||
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
||||
|
||||
// we will do a simple tolerance comparison
|
||||
if (yDifference < .1 ||
|
||||
pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom ||
|
||||
pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)
|
||||
{
|
||||
return Float.compare(x1, x2);
|
||||
}
|
||||
else if (pos1YBottom < pos2YBottom)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -19,15 +19,16 @@ public final class IdBuilder {
|
||||
private final HashFunction hashFunction = Hashing.murmur3_128();
|
||||
|
||||
|
||||
public String buildId(Set<Page> pages, List<Rectangle2D> rectanglesPerLine) {
|
||||
public String buildId(Set<Page> pages, List<Rectangle2D> rectanglesPerLine, String type, String entityType) {
|
||||
|
||||
return buildId(pages.stream().map(Page::getNumber).collect(Collectors.toList()), rectanglesPerLine);
|
||||
return buildId(pages.stream().map(Page::getNumber).collect(Collectors.toList()), rectanglesPerLine, type, entityType);
|
||||
}
|
||||
|
||||
|
||||
public String buildId(List<Integer> pageNumbers, List<Rectangle2D> rectanglesPerLine) {
|
||||
public String buildId(List<Integer> pageNumbers, List<Rectangle2D> rectanglesPerLine, String type, String entityType) {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append(type).append(entityType);
|
||||
List<Integer> sortedPageNumbers = pageNumbers.stream().sorted(Comparator.comparingInt(Integer::intValue)).toList();
|
||||
sortedPageNumbers.forEach(sb::append);
|
||||
rectanglesPerLine.forEach(rectangle2D -> sb.append(Math.round(rectangle2D.getX()))
|
||||
|
||||
@ -222,7 +222,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
|
||||
@Test
|
||||
public void titleExtraction() throws IOException {
|
||||
|
||||
AnalyzeRequest request = uploadFileToStorage("files/new/S157.pdf");
|
||||
AnalyzeRequest request = uploadFileToStorage("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
System.out.println("Start Full integration test");
|
||||
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
|
||||
System.out.println("Finished structure analysis");
|
||||
|
||||
@ -8622,4 +8622,5 @@ F. Lastname
|
||||
Mustermann
|
||||
Lastname
|
||||
Bojangles
|
||||
Tambourine Man
|
||||
Tambourine Man
|
||||
Tournayre J.C.
|
||||
@ -58,8 +58,8 @@ rule "SYN.0.0: Redact if CTL/* or BL/* was found (Non Vertebrate Study)"
|
||||
$section: Section(containsString("CTL/") || containsString("BL/"))
|
||||
then
|
||||
Stream.concat(
|
||||
entityCreationService.byString("CTL", "hint", EntityType.ENTITY, $section),
|
||||
entityCreationService.byString("BL", "hint", EntityType.ENTITY, $section)
|
||||
entityCreationService.byString("CTL", "must_redact", EntityType.ENTITY, $section),
|
||||
entityCreationService.byString("BL", "must_redact", EntityType.ENTITY, $section)
|
||||
).forEach(entity -> {
|
||||
entity.setRedactionReason("hint_only");
|
||||
entity.addMatchedRule(0);
|
||||
@ -1105,8 +1105,7 @@ rule "MAN.1.0: Apply id removals that are valid and not in forced redactions to
|
||||
not ManualForceRedaction($id == annotationId, status == AnnotationStatus.APPROVED, requestDate != null)
|
||||
$entityToBeRemoved: RedactionEntity(matchesAnnotationId($id))
|
||||
then
|
||||
$entityToBeRemoved.removeFromGraph();
|
||||
retract($entityToBeRemoved);
|
||||
$entityToBeRemoved.setIgnored(true);
|
||||
end
|
||||
|
||||
rule "MAN.1.1: Apply id removals that are valid and not in forced redactions to Image"
|
||||
@ -1224,10 +1223,10 @@ rule "X.5.0: remove Entity of type RECOMMENDATION when contained by ENTITY"
|
||||
|
||||
|
||||
// Rule unit: X.6
|
||||
rule "X.6.0: remove Entity of lower rank, when intersects"
|
||||
rule "X.6.0: remove Entity of lower rank, when intersected by entity of type ENTITY"
|
||||
salience 32
|
||||
when
|
||||
$higherRank: RedactionEntity($type: type)
|
||||
$higherRank: RedactionEntity($type: type, entityType == EntityType.ENTITY)
|
||||
$lowerRank: RedactionEntity(intersects($higherRank), type != $type, dictionary.getDictionaryRank(type) < dictionary.getDictionaryRank($type), !resized, !skipRemoveEntitiesContainedInLarger)
|
||||
then
|
||||
$lowerRank.removeFromGraph();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user