RED-6009: Sort TextPositionSequences with tolerance
This commit is contained in:
parent
d0264248fb
commit
77a420c849
@ -95,7 +95,7 @@ public class DocumentGraphFactory {
|
|||||||
Rectangle2D position = image.getPosition();
|
Rectangle2D position = image.getPosition();
|
||||||
Page page = context.getPage(image.getPage());
|
Page page = context.getPage(image.getPage());
|
||||||
Image imageNode = Image.builder()
|
Image imageNode = Image.builder()
|
||||||
.id(IdBuilder.buildId(Set.of(page), List.of(position)))
|
.id(IdBuilder.buildId(Set.of(page), List.of(position), image.getImageType().name(), "image"))
|
||||||
.imageType(image.getImageType())
|
.imageType(image.getImageType())
|
||||||
.position(position)
|
.position(position)
|
||||||
.transparent(image.isHasTransparency())
|
.transparent(image.isHasTransparency())
|
||||||
|
|||||||
@ -139,7 +139,7 @@ public class RedactionEntity {
|
|||||||
.min(Comparator.comparingInt(Page::getNumber))
|
.min(Comparator.comparingInt(Page::getNumber))
|
||||||
.orElseThrow(() -> new RuntimeException("No Positions found on any page!"));
|
.orElseThrow(() -> new RuntimeException("No Positions found on any page!"));
|
||||||
|
|
||||||
String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList());
|
String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList(), type, entityType.name());
|
||||||
redactionPositionsPerPage = rectanglesPerLinePerPage.entrySet().stream().map(entry -> buildRedactionPosition(firstPage, id, entry)).toList();
|
redactionPositionsPerPage = rectanglesPerLinePerPage.entrySet().stream().map(entry -> buildRedactionPosition(firstPage, id, entry)).toList();
|
||||||
}
|
}
|
||||||
return redactionPositionsPerPage;
|
return redactionPositionsPerPage;
|
||||||
|
|||||||
@ -285,7 +285,7 @@ public class EntityCreationService {
|
|||||||
|
|
||||||
RedactionEntity highlightEntity = RedactionEntity.initialEntityNode(new Boundary(tableCell.getBoundary().start(), tableCell.getBoundary().start()), type, entityType);
|
RedactionEntity highlightEntity = RedactionEntity.initialEntityNode(new Boundary(tableCell.getBoundary().start(), tableCell.getBoundary().start()), type, entityType);
|
||||||
|
|
||||||
String positionId = IdBuilder.buildId(tableCell.getBBox().keySet(), tableCell.getBBox().values().stream().toList());
|
String positionId = IdBuilder.buildId(tableCell.getBBox().keySet(), tableCell.getBBox().values().stream().toList(), type, entityType.name());
|
||||||
highlightEntity.setRedactionPositionsPerPage(tableCell.getBBox()
|
highlightEntity.setRedactionPositionsPerPage(tableCell.getBBox()
|
||||||
.entrySet()
|
.entrySet()
|
||||||
.stream()
|
.stream()
|
||||||
|
|||||||
@ -1,20 +1,27 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils;
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils;
|
||||||
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.util.QuickSort;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||||
|
|
||||||
public class TextPositionOperations {
|
public class TextPositionOperations {
|
||||||
|
|
||||||
|
private static final TextPositionSequenceComparator comparator = new TextPositionSequenceComparator();
|
||||||
|
|
||||||
|
|
||||||
public static List<TextPositionSequence> mergeAndSortTextPositionSequenceByYThenX(List<TextPageBlock> textBlocks) {
|
public static List<TextPositionSequence> mergeAndSortTextPositionSequenceByYThenX(List<TextPageBlock> textBlocks) {
|
||||||
|
|
||||||
return textBlocks.stream()//
|
var sequence = textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList());
|
||||||
.flatMap(tb -> tb.getSequences().stream())//
|
|
||||||
.sorted(Comparator.comparingDouble(TextPositionSequence::getMaxYDirAdj)//
|
// because the TextPositionSequenceComparator is not transitive, but
|
||||||
.thenComparing(TextPositionSequence::getMaxXDirAdj))//
|
// the JDK7+ sort (TimSort) may throw on non-transitive comparators, we need to use
|
||||||
.toList();
|
// a custom quicksort implementation (which is slower, unfortunately).
|
||||||
|
QuickSort.sort(sequence, comparator);
|
||||||
|
return sequence;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,74 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class is a comparator for TextPositionSequence objects. It handles
|
||||||
|
* pages with text in different directions by grouping the text based
|
||||||
|
* on direction and sorting in that direction. This allows continuous text
|
||||||
|
* in a given direction to be more easily grouped together.
|
||||||
|
*
|
||||||
|
* @author Ben Litchfield
|
||||||
|
*/
|
||||||
|
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence>
|
||||||
|
{
|
||||||
|
@Override
|
||||||
|
public int compare(TextPositionSequence pos1, TextPositionSequence pos2)
|
||||||
|
{
|
||||||
|
// only compare text that is in the same direction
|
||||||
|
int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
|
||||||
|
if (cmp1 != 0)
|
||||||
|
{
|
||||||
|
return cmp1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// get the text direction adjusted coordinates
|
||||||
|
float x1 = pos1.getMinXDirAdj();
|
||||||
|
float x2 = pos2.getMinXDirAdj();
|
||||||
|
|
||||||
|
float pos1YBottom = pos1.getMaxYDirAdj();
|
||||||
|
float pos2YBottom = pos2.getMaxYDirAdj();
|
||||||
|
|
||||||
|
// note that the coordinates have been adjusted so 0,0 is in upper left
|
||||||
|
float pos1YTop = pos1YBottom - pos1.getTextHeight();
|
||||||
|
float pos2YTop = pos2YBottom - pos2.getTextHeight();
|
||||||
|
|
||||||
|
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
||||||
|
|
||||||
|
// we will do a simple tolerance comparison
|
||||||
|
if (yDifference < .1 ||
|
||||||
|
pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom ||
|
||||||
|
pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)
|
||||||
|
{
|
||||||
|
return Float.compare(x1, x2);
|
||||||
|
}
|
||||||
|
else if (pos1YBottom < pos2YBottom)
|
||||||
|
{
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -19,15 +19,16 @@ public final class IdBuilder {
|
|||||||
private final HashFunction hashFunction = Hashing.murmur3_128();
|
private final HashFunction hashFunction = Hashing.murmur3_128();
|
||||||
|
|
||||||
|
|
||||||
public String buildId(Set<Page> pages, List<Rectangle2D> rectanglesPerLine) {
|
public String buildId(Set<Page> pages, List<Rectangle2D> rectanglesPerLine, String type, String entityType) {
|
||||||
|
|
||||||
return buildId(pages.stream().map(Page::getNumber).collect(Collectors.toList()), rectanglesPerLine);
|
return buildId(pages.stream().map(Page::getNumber).collect(Collectors.toList()), rectanglesPerLine, type, entityType);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public String buildId(List<Integer> pageNumbers, List<Rectangle2D> rectanglesPerLine) {
|
public String buildId(List<Integer> pageNumbers, List<Rectangle2D> rectanglesPerLine, String type, String entityType) {
|
||||||
|
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
|
sb.append(type).append(entityType);
|
||||||
List<Integer> sortedPageNumbers = pageNumbers.stream().sorted(Comparator.comparingInt(Integer::intValue)).toList();
|
List<Integer> sortedPageNumbers = pageNumbers.stream().sorted(Comparator.comparingInt(Integer::intValue)).toList();
|
||||||
sortedPageNumbers.forEach(sb::append);
|
sortedPageNumbers.forEach(sb::append);
|
||||||
rectanglesPerLine.forEach(rectangle2D -> sb.append(Math.round(rectangle2D.getX()))
|
rectanglesPerLine.forEach(rectangle2D -> sb.append(Math.round(rectangle2D.getX()))
|
||||||
|
|||||||
@ -222,7 +222,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
|
|||||||
@Test
|
@Test
|
||||||
public void titleExtraction() throws IOException {
|
public void titleExtraction() throws IOException {
|
||||||
|
|
||||||
AnalyzeRequest request = uploadFileToStorage("files/new/S157.pdf");
|
AnalyzeRequest request = uploadFileToStorage("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||||
System.out.println("Start Full integration test");
|
System.out.println("Start Full integration test");
|
||||||
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
|
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
|
||||||
System.out.println("Finished structure analysis");
|
System.out.println("Finished structure analysis");
|
||||||
|
|||||||
@ -8622,4 +8622,5 @@ F. Lastname
|
|||||||
Mustermann
|
Mustermann
|
||||||
Lastname
|
Lastname
|
||||||
Bojangles
|
Bojangles
|
||||||
Tambourine Man
|
Tambourine Man
|
||||||
|
Tournayre J.C.
|
||||||
@ -58,8 +58,8 @@ rule "SYN.0.0: Redact if CTL/* or BL/* was found (Non Vertebrate Study)"
|
|||||||
$section: Section(containsString("CTL/") || containsString("BL/"))
|
$section: Section(containsString("CTL/") || containsString("BL/"))
|
||||||
then
|
then
|
||||||
Stream.concat(
|
Stream.concat(
|
||||||
entityCreationService.byString("CTL", "hint", EntityType.ENTITY, $section),
|
entityCreationService.byString("CTL", "must_redact", EntityType.ENTITY, $section),
|
||||||
entityCreationService.byString("BL", "hint", EntityType.ENTITY, $section)
|
entityCreationService.byString("BL", "must_redact", EntityType.ENTITY, $section)
|
||||||
).forEach(entity -> {
|
).forEach(entity -> {
|
||||||
entity.setRedactionReason("hint_only");
|
entity.setRedactionReason("hint_only");
|
||||||
entity.addMatchedRule(0);
|
entity.addMatchedRule(0);
|
||||||
@ -1105,8 +1105,7 @@ rule "MAN.1.0: Apply id removals that are valid and not in forced redactions to
|
|||||||
not ManualForceRedaction($id == annotationId, status == AnnotationStatus.APPROVED, requestDate != null)
|
not ManualForceRedaction($id == annotationId, status == AnnotationStatus.APPROVED, requestDate != null)
|
||||||
$entityToBeRemoved: RedactionEntity(matchesAnnotationId($id))
|
$entityToBeRemoved: RedactionEntity(matchesAnnotationId($id))
|
||||||
then
|
then
|
||||||
$entityToBeRemoved.removeFromGraph();
|
$entityToBeRemoved.setIgnored(true);
|
||||||
retract($entityToBeRemoved);
|
|
||||||
end
|
end
|
||||||
|
|
||||||
rule "MAN.1.1: Apply id removals that are valid and not in forced redactions to Image"
|
rule "MAN.1.1: Apply id removals that are valid and not in forced redactions to Image"
|
||||||
@ -1224,10 +1223,10 @@ rule "X.5.0: remove Entity of type RECOMMENDATION when contained by ENTITY"
|
|||||||
|
|
||||||
|
|
||||||
// Rule unit: X.6
|
// Rule unit: X.6
|
||||||
rule "X.6.0: remove Entity of lower rank, when intersects"
|
rule "X.6.0: remove Entity of lower rank, when intersected by entity of type ENTITY"
|
||||||
salience 32
|
salience 32
|
||||||
when
|
when
|
||||||
$higherRank: RedactionEntity($type: type)
|
$higherRank: RedactionEntity($type: type, entityType == EntityType.ENTITY)
|
||||||
$lowerRank: RedactionEntity(intersects($higherRank), type != $type, dictionary.getDictionaryRank(type) < dictionary.getDictionaryRank($type), !resized, !skipRemoveEntitiesContainedInLarger)
|
$lowerRank: RedactionEntity(intersects($higherRank), type != $type, dictionary.getDictionaryRank(type) < dictionary.getDictionaryRank($type), !resized, !skipRemoveEntitiesContainedInLarger)
|
||||||
then
|
then
|
||||||
$lowerRank.removeFromGraph();
|
$lowerRank.removeFromGraph();
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user