diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionLogEntry.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionLogEntry.java index 7a4d90ad..cd7b8c95 100644 --- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionLogEntry.java +++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RedactionLogEntry.java @@ -34,4 +34,7 @@ public class RedactionLogEntry { private ManualRedactionType manualRedactionType; private boolean isDictionaryEntry; + private String textBefore; + private String textAfter; + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java index d3b67e99..d9a197a9 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java @@ -24,7 +24,6 @@ import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationSer import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; import com.iqser.red.service.redaction.v1.server.visualization.service.AnnotationHighlightService; -import com.iqser.red.service.redaction.v1.server.visualization.service.PdfFlattenService; import com.iqser.red.service.redaction.v1.server.visualization.service.PdfVisualisationService; import lombok.RequiredArgsConstructor; @@ -37,7 +36,6 @@ public 
class RedactionController implements RedactionResource { private final PdfSegmentationService pdfSegmentationService; private final AnnotationHighlightService annotationHighlightService; private final EntityRedactionService entityRedactionService; - private final PdfFlattenService pdfFlattenService; private final DroolsExecutionService droolsExecutionService; @@ -52,12 +50,6 @@ public class RedactionController implements RedactionResource { annotationHighlightService.highlight(pdDocument, classifiedDoc, redactionRequest.isFlatRedaction(), redactionRequest .getManualRedactions()); - if (redactionRequest.isFlatRedaction()) { - PDDocument flatDocument = pdfFlattenService.flattenPDF(pdDocument); - return convert(flatDocument, classifiedDoc.getPages() - .size(), classifiedDoc.getRedactionLogEntities(), classifiedDoc.getSectionGrid(), classifiedDoc.getDictionaryVersion(), classifiedDoc.getRulesVersion()); - } - return convert(pdDocument, classifiedDoc.getPages() .size(), classifiedDoc.getRedactionLogEntities(), classifiedDoc.getSectionGrid(), classifiedDoc.getDictionaryVersion(), classifiedDoc.getRulesVersion()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java index 3f511e97..5f301081 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java @@ -35,8 +35,11 @@ public class Entity { private boolean isDictionaryEntry; + private String textBefore; + private String textAfter; - public Entity(String word, String type, boolean redaction, String redactionReason, List positionSequences, String headline, int matchedRule, int 
sectionNumber, String legalBasis, boolean isDictionaryEntry) { + + public Entity(String word, String type, boolean redaction, String redactionReason, List positionSequences, String headline, int matchedRule, int sectionNumber, String legalBasis, boolean isDictionaryEntry, String textBefore, String textAfter) { this.word = word; this.type = type; @@ -48,6 +51,8 @@ public class Entity { this.sectionNumber = sectionNumber; this.legalBasis = legalBasis; this.isDictionaryEntry = isDictionaryEntry; + this.textBefore = textBefore; + this.textAfter = textAfter; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index 346e3569..63ecd2f5 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -41,6 +41,7 @@ public class EntityRedactionService { private final DictionaryService dictionaryService; private final DroolsExecutionService droolsExecutionService; + private final SurroundingWordsService surroundingWordsService; public void processDocument(Document classifiedDoc, ManualRedactions manualRedactions) { @@ -83,7 +84,7 @@ public class EntityRedactionService { .computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity - .getLegalBasis(), entity.isDictionaryEntry())); + .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), 
entity.getTextAfter())); } } @@ -94,9 +95,8 @@ public class EntityRedactionService { } - private Set findEntities(Document classifiedDoc, ManualRedactions manualRedactions, - Dictionary dictionary, boolean local, - Map> hintsPerSectionNumber) { + private Set findEntities(Document classifiedDoc, ManualRedactions manualRedactions, Dictionary dictionary, + boolean local, Map> hintsPerSectionNumber) { Set documentEntities = new HashSet<>(); int sectionNumber = 1; @@ -113,6 +113,7 @@ public class EntityRedactionService { SearchableText searchableRow = new SearchableText(); Map tabularData = new HashMap<>(); int start = 0; + List cellStarts = new ArrayList<>(); for (Cell cell : row) { if (!singleCellTable && cell.isHeaderCell() || CollectionUtils.isEmpty(cell.getTextBlocks())) { continue; @@ -128,12 +129,17 @@ public class EntityRedactionService { .replaceAll("-", ""); tabularData.put(headerName, new CellValue(cell.getTextBlocks(), cellStart)); }); - start = start + cell.toString().length() + 1; + for (TextBlock textBlock : cell.getTextBlocks()) { + // TODO avoid cell overlap merging. 
searchableRow.addAll(textBlock.getSequences()); } + cellStarts.add(cellStart); + start = start + cell.toString().trim().length() + 1; } - Set rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber, dictionary.getDictionaryModels(), local); + Set rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber, dictionary + .getDictionaryModels(), local); + surroundingWordsService.addSurroundingText(rowEntities, searchableRow, dictionary, cellStarts); sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() .isLocal(local) @@ -155,6 +161,7 @@ public class EntityRedactionService { addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber); Set entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber, dictionary.getDictionaryModels(), local); + surroundingWordsService.addSurroundingText(entities, searchableText, dictionary); sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() .isLocal(local) @@ -176,20 +183,20 @@ public class EntityRedactionService { documentEntities.addAll(clearAndFindPositions(analysedRowSection.getEntities(), sectionSearchableTextPair.getSearchableText(), dictionary)); analysedRowSection.getLocalDictionaryAdds().keySet().forEach(key -> { - if (dictionary.isRecommendation(key)){ + if (dictionary.isRecommendation(key)) { analysedRowSection.getLocalDictionaryAdds().get(key).forEach(value -> { - if (!dictionary.containsValue(key, value)){ + if (!dictionary.containsValue(key, value)) { dictionary.getLocalAccessMap().get(key).getLocalEntries().add(value); } }); } else { - analysedRowSection.getLocalDictionaryAdds().get(key).forEach( value -> { + analysedRowSection.getLocalDictionaryAdds().get(key).forEach(value -> { - if(dictionary.getLocalAccessMap().get(key) == null){ + if (dictionary.getLocalAccessMap().get(key) == null) { log.warn("Dictionary {} is null", key); } - 
if(dictionary.getLocalAccessMap().get(key).getLocalEntries() == null){ + if (dictionary.getLocalAccessMap().get(key).getLocalEntries() == null) { log.warn("Dictionary {} localEntries is null", key); } @@ -198,7 +205,6 @@ public class EntityRedactionService { } }); - }); return documentEntities; @@ -243,13 +249,14 @@ public class EntityRedactionService { } - private Set find(String inputString, Set values, String type, String headline, int sectionNumber, boolean local) { + private Set find(String inputString, Set values, String type, String headline, int sectionNumber, + boolean local) { Set found = new HashSet<>(); for (String value : values) { - if(value.trim().length() <= 2) { + if (value.trim().length() <= 2) { continue; }
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SurroundingWordsService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SurroundingWordsService.java
new file mode 100644
index 00000000..4ad64759
--- /dev/null
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SurroundingWordsService.java
@@ -0,0 +1,161 @@
+package com.iqser.red.service.redaction.v1.server.redaction.service;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.springframework.stereotype.Service;
+
+import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
+import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
+import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
+import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
+
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+
+/**
+ * Stores the words found directly before and after an entity on the entity itself.
+ * Entities whose type is a hint in the given dictionary are skipped. Runtime failures are
+ * logged and not propagated to the caller.
+ */
+@Slf4j
+@Service
+@RequiredArgsConstructor
+public class SurroundingWordsService {
+
+    private final RedactionServiceSettings redactionServiceSettings;
+
+
+    /**
+     * Adds surrounding text to every non-hint entity, using the whole searchable text as context.
+     *
+     * @param entities       entities to enrich; start/end are used as offsets into the searchable text
+     * @param searchableText text the entity offsets refer to
+     * @param dictionary     used to recognise hint types, which are skipped
+     */
+    public void addSurroundingText(Set<Entity> entities, SearchableText searchableText, Dictionary dictionary) {
+
+        if (entities.isEmpty()) {
+            return;
+        }
+
+        try {
+            String text = searchableText.toString();
+            for (Entity entity : entities) {
+                if (dictionary.isHint(entity.getType())) {
+                    continue;
+                }
+                findSurroundingWords(entity, text, entity.getStart(), entity.getEnd());
+            }
+        } catch (RuntimeException e) {
+            // Best effort: never fail the caller, but keep the cause in the log.
+            log.warn("Could not get surrounding text!", e);
+        }
+    }
+
+
+    /**
+     * Adds surrounding text to every non-hint entity of a table row. The context of an entity is
+     * taken only from the cell that fully contains it, so words of neighbouring cells are not used.
+     *
+     * @param entities       entities to enrich; start/end are used as offsets into the row text
+     * @param searchableText text of the whole row
+     * @param dictionary     used to recognise hint types, which are skipped
+     * @param cellstarts     start offset of each cell in the row text; a cell is taken to end where
+     *                       the next one starts. Nothing is done when this is {@code null}.
+     */
+    public void addSurroundingText(Set<Entity> entities, SearchableText searchableText, Dictionary dictionary,
+                                   List<Integer> cellstarts) {
+
+        if (entities.isEmpty() || cellstarts == null) {
+            return;
+        }
+
+        try {
+            String searchableString = searchableText.toString();
+            int textLength = searchableString.length();
+
+            for (int i = 0; i < cellstarts.size(); i++) {
+
+                int startOffset = cellstarts.get(i);
+                // End offsets are exclusive: the last cell runs up to and including the final character.
+                int endOffset = i + 1 < cellstarts.size() ? cellstarts.get(i + 1) : textLength;
+                endOffset = Math.min(endOffset, textLength);
+                if (startOffset < 0 || startOffset >= endOffset) {
+                    // Empty or out-of-range cell: nothing to take context from.
+                    continue;
+                }
+
+                String cellText = searchableString.substring(startOffset, endOffset);
+                for (Entity entity : entities) {
+                    if (dictionary.isHint(entity.getType())) {
+                        continue;
+                    }
+                    if (entity.getStart() >= startOffset && entity.getEnd() <= endOffset) {
+                        findSurroundingWords(entity, cellText, entity.getStart() - startOffset,
+                                entity.getEnd() - startOffset);
+                    }
+                }
+            }
+        } catch (RuntimeException e) {
+            // Best effort: never fail the caller, but keep the cause in the log.
+            log.warn("Could not get surrounding text!", e);
+        }
+    }
+
+
+    /**
+     * Sets textBefore/textAfter of the entity to at most the configured number of words found
+     * inside the configured character window before and after the entity.
+     *
+     * @param entity            entity to enrich
+     * @param text              text the offsets refer to
+     * @param entityStartOffset offset of the first character of the entity in {@code text}
+     * @param entityEndOffset   offset directly behind the last character of the entity in {@code text}
+     */
+    private void findSurroundingWords(Entity entity, String text, int entityStartOffset, int entityEndOffset) {
+
+        if (entityStartOffset < 0 || entityEndOffset > text.length() || entityStartOffset > entityEndOffset) {
+            // Offsets do not fit the text; skip this entity instead of aborting all remaining ones.
+            log.warn("Could not get surrounding text! Offsets {}-{} do not fit text of length {}",
+                    entityStartOffset, entityEndOffset, text.length());
+            return;
+        }
+
+        int window = Math.max(0, redactionServiceSettings.getSurroundingWordsOffsetWindow());
+        int maxWords = Math.max(0, redactionServiceSettings.getNumberOfSurroundingWords());
+
+        String before = text.substring(Math.max(0, entityStartOffset - window), entityStartOffset);
+        List<String> wordsBefore = splitIntoWords(before);
+        if (!wordsBefore.isEmpty()) {
+            // Keep the words closest to the entity, i.e. the last ones.
+            int from = Math.max(0, wordsBefore.size() - maxWords);
+            entity.setTextBefore(String.join(" ", wordsBefore.subList(from, wordsBefore.size())).trim());
+        }
+
+        String after = text.substring(entityEndOffset, Math.min(text.length(), entityEndOffset + window));
+        List<String> wordsAfter = splitIntoWords(after);
+        if (!wordsAfter.isEmpty()) {
+            int to = Math.min(maxWords, wordsAfter.size());
+            entity.setTextAfter(String.join(" ", wordsAfter.subList(0, to)).trim());
+        }
+    }
+
+
+    /**
+     * Splits text at single spaces and drops blank tokens. {@code String.split(" ")} keeps an empty
+     * first token when the text starts with a space, which would otherwise be counted as a word.
+     */
+    private List<String> splitIntoWords(String text) {
+
+        List<String> words = new ArrayList<>();
+        for (String token : text.split(" ")) {
+            if (!token.isBlank()) {
+                words.add(token);
+            }
+        }
+        return words;
+    }
+
+}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/TextNormalizationUtilities.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/TextNormalizationUtilities.java index 5405c047..374eebd3 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/TextNormalizationUtilities.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/TextNormalizationUtilities.java @@ -11,7 +11,7 @@ public class TextNormalizationUtilities { * @return Text without line-break hyphenation. 
*/ public static String removeHyphenLineBreaks(String text) { - return text.replaceAll("\\s(\\S+)[\\-\\u00AD]\\R|\n\r(.+ )", "\n$1$2"); + return text.replaceAll("([^\\s\\d\\-]{2,})[\\-\\u00AD]\\R|\n\r(.+ )", "$1$2"); } } \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/settings/RedactionServiceSettings.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/settings/RedactionServiceSettings.java index fea4bf25..df95e899 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/settings/RedactionServiceSettings.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/settings/RedactionServiceSettings.java @@ -7,12 +7,9 @@ import lombok.Data; @Data @ConfigurationProperties("redaction-service") public class RedactionServiceSettings { + + private int numberOfSurroundingWords = 3; - /** - * Tenant used in single tenant mode. 
- */ - private String defaultTenant = "iqser-id"; - - private int flattenImageDpi = 100; + private int surroundingWordsOffsetWindow = 100; } \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java index aaecbc20..a7c4fbba 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java @@ -42,10 +42,13 @@ public class Cell extends Rectangle { StringBuilder sb = new StringBuilder(); + Iterator itty = textBlocks.iterator(); + TextPositionSequence previous = null; while (itty.hasNext()) { + TextBlock textBlock = itty.next(); - TextPositionSequence previous = null; + for (TextPositionSequence word : textBlock.getSequences()) { if (previous != null) { if (Math.abs(previous.getRotationAdjustedY() - word.getRotationAdjustedY()) > word.getTextHeight()) { @@ -57,9 +60,7 @@ public class Cell extends Rectangle { sb.append(word.toString()); previous = word; } - if (itty.hasNext()) { - sb.append(' '); - } + } return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()) @@ -67,4 +68,8 @@ public class Cell extends Rectangle { .replaceAll(" {2}", " "); } + + + + } \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/AnnotationHighlightService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/AnnotationHighlightService.java index 388af0df..a8f07cea 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/AnnotationHighlightService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/AnnotationHighlightService.java @@ -276,6 +276,8 @@ public class AnnotationHighlightService { .sectionNumber(entity.getSectionNumber()) .matchedRule(entity.getMatchedRule()) .isDictionaryEntry(entity.isDictionaryEntry()) + .textAfter(entity.getTextAfter()) + .textBefore(entity.getTextBefore()) .build(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/PdfFlattenService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/PdfFlattenService.java deleted file mode 100644 index 03a97b2e..00000000 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/PdfFlattenService.java +++ /dev/null @@ -1,68 +0,0 @@ -package com.iqser.red.service.redaction.v1.server.visualization.service; - -import java.awt.image.BufferedImage; -import java.io.IOException; - -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.PDPageContentStream; -import org.apache.pdfbox.pdmodel.common.PDRectangle; -import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory; -import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; -import org.apache.pdfbox.rendering.ImageType; -import org.apache.pdfbox.rendering.PDFRenderer; -import org.springframework.stereotype.Service; - -import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings; - -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -@Service -@RequiredArgsConstructor -public class 
PdfFlattenService { - - private final RedactionServiceSettings settings; - - public PDDocument flattenPDF(PDDocument sourceDoc) throws IOException { - - PDDocument destDoc = new PDDocument(); - - PDFRenderer pdfRenderer = new PDFRenderer(sourceDoc); - - final int pageCount = sourceDoc.getDocumentCatalog().getPages().getCount(); - - log.info(pageCount + " page" + (pageCount == 1 ? "" : "s") + " to flatten."); - - for (int i = 0; i < pageCount; i += 1) { - - log.info("Flattening page " + (i + 1) + " of " + pageCount + "..."); - - BufferedImage img = pdfRenderer.renderImageWithDPI(i, settings.getFlattenImageDpi(), ImageType.RGB); - - log.info("Image rendered in memory (" + img.getWidth() + "x" + img.getHeight() + " " + settings.getFlattenImageDpi() + "DPI). Adding to PDF..."); - - PDPage imagePage = new PDPage(new PDRectangle(img.getWidth(), img.getHeight())); - destDoc.addPage(imagePage); - - PDImageXObject imgObj = LosslessFactory.createFromImage(destDoc, img); - - PDPageContentStream imagePageContentStream = new PDPageContentStream(destDoc, imagePage); - imagePageContentStream.drawImage(imgObj, 0, 0); - - log.info("Image added successfully."); - - imagePageContentStream.close(); - - img.flush(); - } - - log.info("New flattened PDF created in memory."); - - sourceDoc.close(); - - return destDoc; - } - -} \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index a10e8ae1..ffed68e6 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -382,7 +382,7 @@ public class 
RedactionIntegrationTest { System.out.println("redactionTest"); long start = System.currentTimeMillis(); - ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21.pdf"); RedactionRequest request = RedactionRequest.builder() .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) @@ -391,11 +391,11 @@ public class RedactionIntegrationTest { RedactionResult result = redactionController.redact(request); - result.getRedactionLog().getRedactionLogEntry().forEach(entry -> { - if(entry.isDictionaryEntry()){ - System.out.println(entry.getValue()); - } - }); +// result.getRedactionLog().getRedactionLogEntry().forEach(entry -> { +// if(!entry.isHint()){ +// System.out.println(entry.getPositions().get(0).getPage() +":"+ entry.getTextBefore() +"--->"+ entry.getValue() + "--->" + entry.getTextAfter()); +// } +// }); try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Redacted.pdf")) { fileOutputStream.write(result.getDocument()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/TextNormalizationUtilities.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/TextNormalizationUtilities.java deleted file mode 100644 index 5405c047..00000000 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/TextNormalizationUtilities.java +++ /dev/null @@ -1,17 +0,0 @@ -package com.iqser.red.service.redaction.v1.server.redaction.utils; - -import lombok.experimental.UtilityClass; - -@UtilityClass -public class TextNormalizationUtilities { - - /** - * Revert hyphenation due to line breaks. - * @param text Text to be processed. 
- * @return Text without line-break hyphenation. - */ - public static String removeHyphenLineBreaks(String text) { - return text.replaceAll("\\s(\\S+)[\\-\\u00AD]\\R|\n\r(.+ )", "\n$1$2"); - } - -} \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/TextNormalizationUtilitiesTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/TextNormalizationUtilitiesTest.java index e4b04ae1..a4c1341e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/TextNormalizationUtilitiesTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/TextNormalizationUtilitiesTest.java @@ -10,11 +10,11 @@ public class TextNormalizationUtilitiesTest { String test = "Without these peo-\nple, this conference would not happen"; Assertions.assertThat(TextNormalizationUtilities.removeHyphenLineBreaks(test)) - .contains("\npeople"); + .contains("people"); test = "Die\t\nFreiwillige\t Versicherung\t endet\t zudem\t für\t den\t ein\u00AD\nzelnen\tVersicherten\tmit\tder\tAufhebung\tdes\tVertra-\nges,\t seiner\t Unterstellung\t unter\t die\t obligatorische\t\nVersicherung\t oder\t seinem\t Ausschluss."; Assertions.assertThat(TextNormalizationUtilities.removeHyphenLineBreaks(test)) - .contains("\neinzelnen", "\nVertrages"); + .contains("einzelnen", "Vertrages"); }