Compare commits

...

2 Commits

Author SHA1 Message Date
Kilian Schuettler
1db515acf0 extended-entity-finding: increase threshold 2024-10-22 15:09:38 +02:00
Kilian Schuettler
f9e7703063 wip: prototype extended EntityFindingUtility 2024-10-22 15:08:51 +02:00
12 changed files with 303 additions and 45 deletions

View File

@ -61,7 +61,7 @@ dependencies {
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
implementation("org.ahocorasick:ahocorasick:0.6.3")
implementation("org.ahocorasick:ahocorasick:0.7.3")
implementation("org.javassist:javassist:3.29.2-GA")
implementation("org.drools:drools-engine:${droolsVersion}")
@ -129,6 +129,7 @@ tasks.named<BootBuildImage>("bootBuildImage") {
"BPE_APPEND_JAVA_TOOL_OPTIONS",
"-XX:MaxMetaspaceSize=1g -Dfile.encoding=UTF-8 -Dkie.repository.project.cache.size=50 -Dkie.repository.project.versions.cache.size=5"
)
environment.put("BPE_DEFAULT_LANG", "en_US.utf8")
environment.put("BPE_DEFAULT_LANG", "en_US.utf8") // java.text.Normalizer does not care for file.encoding
imageName.set("nexus.knecon.com:5001/red/${project.name}")// must build image with same name always, otherwise the builder will not know which image to use as cache. DO NOT CHANGE!

View File

@ -19,6 +19,7 @@ import lombok.Data;
public class SearchImplementation {
private boolean ignoreCase;
private boolean ignoreWhiteSpace;
private List<String> values;
private Pattern pattern;
@ -41,13 +42,22 @@ public class SearchImplementation {
}
/**
 * Creates a search over the given values with explicit case and whitespace handling.
 *
 * @param values           the values to search for; copied into a new list
 * @param ignoreCase       whether matching should ignore case
 * @param ignoreWhiteSpace whether matching should ignore whitespace
 */
public SearchImplementation(Collection<String> values, boolean ignoreCase, boolean ignoreWhiteSpace) {

    this.ignoreWhiteSpace = ignoreWhiteSpace;
    this.ignoreCase = ignoreCase;
    this.values = new ArrayList<>(values);
    // Must run last: it reads the three fields assigned above.
    createSearchImplementation();
}
private void createSearchImplementation() {
if (this.values.isEmpty()) {
return;
}
if (this.values.size() == 1) {
if (this.values.size() == 1 && !this.ignoreWhiteSpace) {
var text = this.values.iterator().next();
if (this.ignoreCase) {
@ -60,6 +70,9 @@ public class SearchImplementation {
if (this.ignoreCase) {
builder.ignoreCase();
}
if (this.ignoreWhiteSpace) {
builder.ignoreWhiteSpace();
}
builder.addKeywords(this.values);

View File

@ -264,7 +264,7 @@ public class AtomicTextBlock implements TextBlock {
}
private List<Integer> getAllLineBreaksInBoundary(TextRange textRange) {
protected List<Integer> getAllLineBreaksInBoundary(TextRange textRange) {
return getLineBreaks().stream()
.map(linebreak -> linebreak + this.textRange.start())

View File

@ -0,0 +1,126 @@
package com.iqser.red.service.redaction.v1.server.model.document.textblock;
import java.awt.geom.Rectangle2D;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * A {@link TextBlock} view onto a part of a single {@link AtomicTextBlock}.
 * <p>
 * Text-returning methods slice the atomic block's search text with {@code textRange};
 * all position and line lookups are delegated unchanged to the wrapped atomic block.
 * <p>
 * NOTE(review): {@link #getSearchText()} uses {@code textRange} directly as substring indices into the
 * atomic block's search text, while {@link #getLineBreaks()} passes the same range to
 * {@code getAllLineBreaksInBoundary}, which shifts line breaks by the atomic block's own range start.
 * Confirm both are meant to use the same coordinate system.
 */
@Data
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class PartialTextBlock implements TextBlock {

    AtomicTextBlock atomicTextBlock;
    TextRange textRange;


    /**
     * @return the part of the atomic block's search text from {@code textRange.start()} (inclusive)
     *         to {@code textRange.end()} (exclusive)
     */
    @Override
    public String getSearchText() {

        return slice(atomicTextBlock.getSearchText());
    }


    /**
     * @return the same slice as {@link #getSearchText()}, taken from the atomic block's lower-case search text
     */
    @Override
    public String getSearchTextLowerCase() {

        return slice(atomicTextBlock.getSearchTextLowerCase());
    }


    /**
     * Splits the search text at the boundaries reported by an English word {@link BreakIterator}.
     *
     * @return every segment between two consecutive boundaries, in order of appearance
     */
    @Override
    public List<String> getWords() {

        String text = getSearchText();
        BreakIterator boundaries = BreakIterator.getWordInstance(Locale.ENGLISH);
        boundaries.setText(text);

        List<String> words = new ArrayList<>();
        int from = boundaries.first();
        int to = boundaries.next();
        while (to != BreakIterator.DONE) {
            words.add(text.substring(from, to));
            from = to;
            to = boundaries.next();
        }
        return words;
    }


    /**
     * @return a single-element list holding the wrapped atomic block
     */
    @Override
    public List<AtomicTextBlock> getAtomicTextBlocks() {

        return List.of(atomicTextBlock);
    }


    /** Delegates to the wrapped atomic block; the result is not limited to {@code textRange}. */
    @Override
    public int getNextLinebreak(int fromIndex) {

        return atomicTextBlock.getNextLinebreak(fromIndex);
    }


    /** Delegates to the wrapped atomic block; the result is not limited to {@code textRange}. */
    @Override
    public int getPreviousLinebreak(int fromIndex) {

        return atomicTextBlock.getPreviousLinebreak(fromIndex);
    }


    /** Delegates to the wrapped atomic block; the line number is passed through unchanged. */
    @Override
    public TextRange getLineTextRange(int lineNumber) {

        return atomicTextBlock.getLineTextRange(lineNumber);
    }


    /**
     * @return the result of the atomic block's {@code getAllLineBreaksInBoundary} for {@code textRange}
     */
    @Override
    public List<Integer> getLineBreaks() {

        return atomicTextBlock.getAllLineBreaksInBoundary(textRange);
    }


    /** Delegates to the wrapped atomic block; the index is passed through unchanged. */
    @Override
    public Rectangle2D getPosition(int stringIdx) {

        return atomicTextBlock.getPosition(stringIdx);
    }


    /** Delegates to the wrapped atomic block; the range is passed through unchanged. */
    @Override
    public List<Rectangle2D> getPositions(TextRange stringTextRange) {

        return atomicTextBlock.getPositions(stringTextRange);
    }


    /** Delegates to the wrapped atomic block; the range is passed through unchanged. */
    @Override
    public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {

        return atomicTextBlock.getPositionsPerPage(stringTextRange);
    }


    /** Delegates to the wrapped atomic block; the range is passed through unchanged. */
    @Override
    public String subSequenceWithLineBreaks(TextRange textRange) {

        return atomicTextBlock.subSequenceWithLineBreaks(textRange);
    }


    /**
     * @return the number of line breaks reported by {@link #getLineBreaks()} plus one
     */
    @Override
    public int numberOfLines() {

        return getLineBreaks().size() + 1;
    }


    /** Cuts the given full text down to {@code [textRange.start(), textRange.end())}. */
    private String slice(String fullText) {

        return fullText.substring(textRange.start(), textRange.end());
    }

}

View File

@ -19,6 +19,7 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntryState;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.Position;
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.ChangeFactory;
import com.iqser.red.service.redaction.v1.server.RedactionServiceSettings;
import com.iqser.red.service.redaction.v1.server.model.PrecursorEntity;
import com.iqser.red.service.redaction.v1.server.model.RectangleWithPage;
@ -33,6 +34,7 @@ import lombok.extern.slf4j.Slf4j;
public class NotFoundImportedEntitiesService {
public static final String IMPORTED_REDACTION_TYPE = "imported_redaction";
private final RedactionServiceSettings settings;
@Timed("redactmanager_processEntityLog")
@ -95,7 +97,7 @@ public class NotFoundImportedEntitiesService {
}
entityLogEntry.getImportedRedactionIntersections().add(precursorEntity.getId());
if (entityLogEntry.getState() != EntryState.REMOVED) {
if (entityLogEntry.getState() != EntryState.REMOVED && !settings.isAnnotationMode()) {
entityLogEntry.setState(EntryState.REMOVED);
entityLogEntry.getChanges().add(ChangeFactory.toChange(ChangeType.REMOVED, OffsetDateTime.now(), analysisNumber));
}

View File

@ -348,16 +348,7 @@ public class ComponentCreationService {
*/
public void create(String ruleIdentifier, String name, String value, String valueDescription, Entity reference) {
referencedEntities.add(reference);
List<Entity> referenceList = new LinkedList<>();
referenceList.add(reference);
kieSession.insert(Component.builder()
.matchedRule(RuleIdentifier.fromString(ruleIdentifier))
.name(name)
.value(value)
.valueDescription(valueDescription)
.references(referenceList)
.build());
create(ruleIdentifier, name, value, valueDescription, List.of(reference));
}

View File

@ -4,6 +4,7 @@ import static java.lang.String.format;
import static java.util.stream.Collectors.groupingBy;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
@ -14,6 +15,7 @@ import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
@ -22,14 +24,18 @@ import com.iqser.red.service.redaction.v1.server.model.ClosestEntity;
import com.iqser.red.service.redaction.v1.server.model.PrecursorEntity;
import com.iqser.red.service.redaction.v1.server.model.RectangleWithPage;
import com.iqser.red.service.redaction.v1.server.model.dictionary.SearchImplementation;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
import com.iqser.red.service.redaction.v1.server.model.document.entity.PositionOnPage;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.PartialTextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.utils.RectangleTransformations;
import com.iqser.red.service.redaction.v1.server.utils.TextNormalizationUtilities;
import lombok.extern.slf4j.Slf4j;
@ -37,6 +43,7 @@ import lombok.extern.slf4j.Slf4j;
@Service
public class EntityFindingUtility {
private static final float STRING_SIMILARITY_THRESHOLD = 0.2f;
EntityCreationService entityCreationService;
@ -55,7 +62,7 @@ public class EntityFindingUtility {
return Optional.empty();
}
List<TextEntity> possibleEntities = entitiesWithSameValue.get(precursorEntity.getValue().toLowerCase(Locale.ENGLISH));
List<TextEntity> possibleEntities = entitiesWithSameValue.get(TextNormalizationUtilities.removeAllWhitespaces(precursorEntity.getValue().toLowerCase(Locale.ENGLISH)));
if (entityIdentifierValueNotFound(possibleEntities)) {
log.info("Entity could not be created with precursorEntity: {}, due to the value {} not being found anywhere.", precursorEntity, precursorEntity.getValue());
@ -91,6 +98,79 @@ public class EntityFindingUtility {
}
/**
 * Tries to create an entity from the text lying underneath the rectangles of a precursor entity.
 * <p>
 * The page is taken from the first entity position. The bounding box of all entity positions is matched
 * against the first text block on that page whose bounding box intersects it. Each character run of that
 * block under the bounding box is then compared to the precursor value by a thresholded Levenshtein distance.
 * The first run within the threshold is turned into an entity.
 *
 * @param precursorEntity the precursor whose positions and value are used for the lookup
 * @param document        the document to search and to create the entity in
 * @return the created entity, or an empty Optional if there is no position, page, intersecting text block
 *         or sufficiently similar character run
 */
public Optional<TextEntity> findEntityByUnderlyingText(PrecursorEntity precursorEntity, Document document) {

    if (precursorEntity.getEntityPosition().isEmpty()) {
        return Optional.empty();
    }

    Page page = document.getPages()
            .stream()
            .filter(candidate -> candidate.getNumber()
                    .equals(precursorEntity.getEntityPosition()
                                    .get(0).pageNumber()))
            .findFirst()
            .orElse(null);
    if (page == null) {
        return Optional.empty();
    }

    Rectangle2D boundingBox = precursorEntity.getEntityPosition()
            .stream()
            .map(RectangleWithPage::rectangle2D)
            .collect(RectangleTransformations.collectBBox());

    // Only the first intersecting text block on the page is considered.
    AtomicTextBlock textBlock = page.getTextBlocksOnPage()
            .stream()
            .filter(tb -> RectangleTransformations.rectangle2DBBox(tb.getPositions()).intersects(boundingBox))
            .findFirst()
            .orElse(null);
    if (textBlock == null) {
        return Optional.empty();
    }

    for (PartialTextBlock candidateRun : findUnderlyingCharacterRuns(textBlock, boundingBox)) {
        // Allowed edit distance grows with the shorter of the two strings; the +1 always tolerates one edit.
        int maxDistance = (int) (Math.min(candidateRun.length(), precursorEntity.length()) * STRING_SIMILARITY_THRESHOLD) + 1;
        LevenshteinDistance boundedDistance = new LevenshteinDistance(maxDistance);
        // A thresholded LevenshteinDistance yields -1 when the distance exceeds the threshold.
        boolean similarEnough = boundedDistance.apply(candidateRun.getSearchText(), precursorEntity.getValue()) >= 0;
        if (similarEnough) {
            // NOTE(review): the run's range is handed to a document-level API as-is — confirm it is in document coordinates.
            return entityCreationService.byTextRangeWithEngine(candidateRun.getTextRange(), "temp", EntityType.ENTITY, document, Collections.emptySet());
        }
    }
    return Optional.empty();
}
/**
 * Collects the maximal runs of consecutive characters of a text block whose position rectangles intersect
 * the given rectangle.
 * <p>
 * Each run is returned as a {@link PartialTextBlock} with a half-open range {@code [start, end)}, which is how
 * {@code PartialTextBlock#getSearchText()} consumes it via {@code substring(start, end)}. The range indices
 * are counted from the beginning of the text block (index 0 is the block's first character).
 * <p>
 * NOTE(review): positions are looked up at {@code i + textRange.start()}, so the block's own range appears to
 * be in document coordinates while the ranges created here are block-relative. Verify that consumers of
 * {@code PartialTextBlock#getTextRange()} expect block-relative indices.
 *
 * @param intersectingTb the text block whose characters are inspected
 * @param rect           the rectangle the character positions are tested against
 * @return the intersecting character runs in text order; empty if no character intersects
 */
private static List<PartialTextBlock> findUnderlyingCharacterRuns(AtomicTextBlock intersectingTb, Rectangle2D rect) {

    List<PartialTextBlock> intersectingTextBlocks = new ArrayList<>();
    int blockStart = intersectingTb.getTextRange().start();
    int characterCount = intersectingTb.getPositions().size();

    int runStart = -1; // -1 means no run is currently open
    for (int i = 0; i < characterCount; i++) {
        boolean intersects = intersectingTb.getPosition(i + blockStart).intersects(rect);
        if (intersects) {
            if (runStart == -1) {
                runStart = i;
            }
        } else if (runStart != -1) {
            // i is the first character after the run, i.e. the exclusive end of the range.
            intersectingTextBlocks.add(new PartialTextBlock(intersectingTb, new TextRange(runStart, i)));
            runStart = -1;
        }
    }
    if (runStart != -1) {
        // The run reaches the end of the block, so the exclusive end is the character count.
        intersectingTextBlocks.add(new PartialTextBlock(intersectingTb, new TextRange(runStart, characterCount)));
    }
    return intersectingTextBlocks;
}
private static boolean entityIdentifierValueNotFound(List<TextEntity> possibleEntities) {
return possibleEntities == null || possibleEntities.isEmpty();
@ -183,7 +263,7 @@ public class EntityFindingUtility {
SearchImplementation searchImplementation = new SearchImplementation(entryValues.stream()
.map(String::trim)
.collect(Collectors.toSet()), true);
.collect(Collectors.toSet()), true, true);
List<TextBlock> textBlocks = node.getTextBlocksByPageNumbers(pageNumbers);
@ -193,7 +273,7 @@ public class EntityFindingUtility {
.filter(Optional::isPresent)
.map(Optional::get)
.distinct()
.collect(groupingBy(entity -> entity.getValue().toLowerCase(Locale.ROOT)));
.collect(groupingBy(entity -> TextNormalizationUtilities.removeAllWhitespaces(entity.getValue().toLowerCase(Locale.ROOT))));
}

View File

@ -34,7 +34,7 @@ import lombok.extern.slf4j.Slf4j;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class EntityFromPrecursorCreationService {
static double MATCH_THRESHOLD = 10; // Is compared to the average sum of distances in pdf coordinates for each corner of the bounding box of the entities
static double MATCH_THRESHOLD = 100; // Is compared to the average sum of distances in pdf coordinates for each corner of the bounding box of the entities
EntityFindingUtility entityFindingUtility;
DictionaryService dictionaryService;
RedactionServiceSettings settings;
@ -130,10 +130,8 @@ public class EntityFromPrecursorCreationService {
} else {
String section = precursorEntity.getManualOverwrite().getSection()
.orElse(null);
if ((section == null || section.isBlank())
&& precursorEntity.getSection() != null
&& !precursorEntity.getSection().isBlank()
&& precursorEntity.getEngines().contains(Engine.IMPORTED)) {
if ((section == null || section.isBlank()) && precursorEntity.getSection() != null && !precursorEntity.getSection().isBlank() && precursorEntity.getEngines()
.contains(Engine.IMPORTED)) {
section = precursorEntity.getSection();
}

View File

@ -1,31 +1,47 @@
package com.iqser.red.service.redaction.v1.server.utils;
import java.util.regex.Pattern;
import lombok.experimental.UtilityClass;
@UtilityClass
public final class TextNormalizationUtilities {
/**
* Revert hyphenation due to line breaks.
*
* @param text Text to be processed.
* @return Text without line-break hyphenation.
*/
public static String removeHyphenLineBreaks(String text) {
public static final Pattern hyphenLineBreaks = Pattern.compile("[-~‐‒⁻−﹣゠⁓‑\\u00AD][\\r\\n]+");
public static final Pattern linebreaks = Pattern.compile("[\\r\\n]+");
public static final Pattern doubleWhitespaces = Pattern.compile("\\s{2,}");
public static final Pattern WHITESPACE_REMOVAL = Pattern.compile("\\s+");
return text.replaceAll("([^\\s\\d\\-]{2,500})[\\-\\u00AD]\\R", "$1");
public String cleanString(String value) {
String noHyphenLinebreaks = removeHyphenLinebreaks(value);
String noLinebreaks = removeLinebreaks(noHyphenLinebreaks);
return removeMultipleWhitespaces(noLinebreaks);
}
public static String removeLineBreaks(String text) {
public String removeHyphenLinebreaks(String value) {
return text.replaceAll("\n", " ");
return hyphenLineBreaks.matcher(value).replaceAll("");
}
public static String removeRepeatingWhitespaces(String text) {
private String removeMultipleWhitespaces(String value) {
return text.replaceAll(" {2}", " ");
return doubleWhitespaces.matcher(value).replaceAll(" ");
}
private String removeLinebreaks(String value) {
return linebreaks.matcher(value).replaceAll(" ");
}
public String removeAllWhitespaces(String value) {
return WHITESPACE_REMOVAL.matcher(value).replaceAll("");
}
}

View File

@ -611,7 +611,7 @@ public abstract class AbstractRedactionIntegrationTest {
private String cleanDictionaryEntry(String entry) {
return TextNormalizationUtilities.removeHyphenLineBreaks(entry).replaceAll("\\n", " ");
return TextNormalizationUtilities.cleanString(entry);
}

View File

@ -6,8 +6,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import java.io.BufferedWriter;
@ -37,7 +35,6 @@ import org.junit.jupiter.api.Order;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestMethodOrder;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
import org.springframework.boot.test.context.SpringBootTest;
@ -83,7 +80,6 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
import com.iqser.red.service.redaction.v1.server.redaction.utils.OsUtils;
import com.iqser.red.service.redaction.v1.server.rules.RulesIntegrationTest;
import com.iqser.red.service.redaction.v1.server.service.document.DocumentGraphMapper;
import com.iqser.red.service.redaction.v1.server.storage.DocumentDataFallbackService;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
@ -102,7 +98,6 @@ public class RedactionIntegrationTest extends RulesIntegrationTest {
private static final String RULES = loadFromClassPath("drools/rules.drl");
@Configuration
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
@Import({LayoutParsingServiceProcessorConfiguration.class})
@ -1233,6 +1228,44 @@ public class RedactionIntegrationTest extends RulesIntegrationTest {
}
/**
 * Analyzes a document together with its imported redactions, writes the annotated PDF to the temporary
 * directory and checks the imported-redaction intersections of selected entity log entries:
 * entries with value "David" must have exactly one intersection, entries with value "annotation" none.
 * Currently disabled.
 */
@Test
@Disabled
public void testImportedRedactions2() throws IOException {

    String outputFileName = OsUtils.getTemporaryDirectory() + "/ImportedRedactions.pdf";

    // Store the imported redactions next to the uploaded origin file before analysis runs.
    ClassPathResource importedRedactions = new ClassPathResource("files/ImportedRedactions/76c5683ebc8c19dc23eccea12dfc652b.IMPORTED_REDACTIONS.json");
    AnalyzeRequest request = uploadFileToStorage("files/ImportedRedactions/76c5683ebc8c19dc23eccea12dfc652b.ORIGIN.pdf");
    String importedRedactionsStorageId = StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.IMPORTED_REDACTIONS);
    storageService.storeObject(TenantContext.getTenantId(), importedRedactionsStorageId, importedRedactions.getInputStream());

    analyzeDocumentStructure(LayoutParsingType.DOCUMINE_OLD, request);
    AnalyzeResult result = analyzeService.analyze(request);

    var entityLog = redactionStorageService.getEntityLog(TEST_DOSSIER_ID, TEST_FILE_ID);

    AnnotateRequest annotateRequest = AnnotateRequest.builder().dossierId(TEST_DOSSIER_ID).fileId(TEST_FILE_ID).build();
    AnnotateResponse annotateResponse = annotationService.annotate(annotateRequest);
    try (FileOutputStream fileOutputStream = new FileOutputStream(outputFileName)) {
        fileOutputStream.write(annotateResponse.getDocument());
    }

    for (var entry : entityLog.getEntityLogEntry()) {
        // Entries without a value are not checked.
        if (entry.getValue() == null) {
            continue;
        }
        if (entry.getValue().equals("David")) {
            assertThat(entry.getImportedRedactionIntersections()).hasSize(1);
        }
        if (entry.getValue().equals("annotation")) {
            assertThat(entry.getImportedRedactionIntersections()).isEmpty();
        }
    }
}
@Test
public void testExpandByPrefixRegEx() throws IOException {
@ -2252,9 +2285,7 @@ public class RedactionIntegrationTest extends RulesIntegrationTest {
@Test
public void testFileWithImagesAndNoText() {
AnalyzeRequest request = prepareStorage("files/new/only_images.pdf",
"files/cv_service_empty_response.json",
"files/only_images_file_image_response.json");
AnalyzeRequest request = prepareStorage("files/new/only_images.pdf", "files/cv_service_empty_response.json", "files/only_images_file_image_response.json");
analyzeDocumentStructure(LayoutParsingType.REDACT_MANAGER, request);
AnalyzeResult result = analyzeService.analyze(request);

View File

@ -11,10 +11,10 @@ public class TextNormalizationUtilitiesTest {
public void testHyphenRemoval() {
String test = "Without these peo-\nple, this conference would not happen";
Assertions.assertThat(TextNormalizationUtilities.removeHyphenLineBreaks(test)).contains("people");
Assertions.assertThat(TextNormalizationUtilities.removeHyphenLinebreaks(test)).contains("people");
test = "Die\t\nFreiwillige\t Versicherung\t endet\t zudem\t für\t den\t ein\u00AD\nzelnen\tVersicherten\tmit\tder\tAufhebung\tdes\tVertra-\nges,\t seiner\t Unterstellung\t unter\t die\t obligatorische\t\nVersicherung\t oder\t seinem\t Ausschluss.";
Assertions.assertThat(TextNormalizationUtilities.removeHyphenLineBreaks(test)).contains("einzelnen", "Vertrages");
Assertions.assertThat(TextNormalizationUtilities.removeHyphenLinebreaks(test)).contains("einzelnen", "Vertrages");
}