From 5c2596e268e860ddabb871dc642ec85a08bf9bcd Mon Sep 17 00:00:00 2001 From: Timo Date: Mon, 19 Apr 2021 13:08:32 +0300 Subject: [PATCH] Serialization of text --- .../classification/model/SectionText.java | 6 +++ .../classification/model/TextBlock.java | 2 + .../ObjectMapperConfiguration.java | 43 ++++++++++++++++ .../v1/server/parsing/model/RedMatrix.java | 10 ++++ .../server/parsing/model/RedTextPosition.java | 48 +++++++++++++++++ .../parsing/model/TextPositionSequence.java | 51 ++++++++++++++----- .../redaction/service/ReanalyzeService.java | 15 ++++++ .../service/RedactionLogCreatorService.java | 10 ++-- .../segmentation/PdfSegmentationService.java | 5 +- 9 files changed, 167 insertions(+), 23 deletions(-) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/configuration/ObjectMapperConfiguration.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedMatrix.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java index c9c88cec..f62091a7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java @@ -31,6 +31,12 @@ public class SectionText { private List cellStarts = new ArrayList<>(); + public void setTabularData(Map tabularData) { /* Stores the given map after dropping its null-key entry; a null map is stored unchanged. NOTE(review): presumably the null key is removed so the map can be written as JSON - confirm. */ + if (tabularData != null) { tabularData.remove(null); } 
+ this.tabularData = tabularData; + } + + public SearchableText getSearchableText() { SearchableText searchableText = new SearchableText(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java index 6da9f6a0..d40f8671 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java @@ -6,6 +6,7 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractT import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; +import lombok.NoArgsConstructor; import java.util.ArrayList; import java.util.List; @@ -13,6 +14,7 @@ import java.util.List; @AllArgsConstructor @Builder @Data +@NoArgsConstructor public class TextBlock extends AbstractTextContainer { @Builder.Default diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/configuration/ObjectMapperConfiguration.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/configuration/ObjectMapperConfiguration.java new file mode 100644 index 00000000..44d3c2b7 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/configuration/ObjectMapperConfiguration.java @@ -0,0 +1,43 @@ +package com.iqser.red.service.redaction.v1.server.configuration; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.core.Version; +import com.fasterxml.jackson.databind.JsonSerializer; +import 
com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializerProvider; +import com.fasterxml.jackson.databind.module.SimpleModule; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.context.annotation.Primary; + +import java.io.IOException; +/** Spring configuration that declares the @Primary Jackson ObjectMapper bean. */ +@Configuration +public class ObjectMapperConfiguration { + + /** Creates an ObjectMapper and registers a SimpleModule whose serializers handle PDFont and PDTrueTypeFont values. */ + @Bean + @Primary + public ObjectMapper objectMapper() { + var objectMapper = new ObjectMapper(); + SimpleModule simpleModule = new SimpleModule("SimpleModule", + new Version(1, 0, 0, null)); + // PDFont values (including PDTrueTypeFont) are written as JSON null by PDFontSerializer. + simpleModule.addSerializer(PDFont.class, new PDFontSerializer()); + simpleModule.addSerializer(PDTrueTypeFont.class, new PDFontSerializer()); + objectMapper.registerModule(simpleModule); + + return objectMapper; + } + + /** Serializer that ignores the font it is given and writes a JSON null instead. */ + public static class PDFontSerializer extends JsonSerializer { + + @Override + public void serialize(PDFont t, JsonGenerator jsonGenerator, SerializerProvider serializerProvider) throws IOException { + jsonGenerator.writeNull(); + } + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedMatrix.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedMatrix.java new file mode 100644 index 00000000..f58c7475 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedMatrix.java @@ -0,0 +1,10 @@ +package com.iqser.red.service.redaction.v1.server.parsing.model; + +import lombok.Data; + +/** Lombok @Data holder for one float array. NOTE(review): no other hunk of this patch references RedMatrix (RedTextPosition declares a PDFBox Matrix field) - confirm the intended use. */ +@Data +public class RedMatrix { + + private float[] single; +} diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java new file mode 100644 index 00000000..effafad8 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java @@ -0,0 +1,48 @@ +package com.iqser.red.service.redaction.v1.server.parsing.model; + +import lombok.Data; +import lombok.SneakyThrows; +import org.apache.pdfbox.text.TextPosition; +import org.apache.pdfbox.util.Matrix; +import org.springframework.beans.BeanUtils; +/** Lombok @Data bean holding values copied from a PDFBox TextPosition plus the name of its font; instances are created via fromTextPosition. */ +@Data +public class RedTextPosition { + + private Matrix textMatrix; + private float endX; + private float endY; + private float maxHeight; + private int rotation; + private float x; + private float y; + private float pageHeight; + private float pageWidth; + private float widthOfSpace; + private int[] charCodes; + private float fontSize; + private float fontSizePt; + private float[] widths; + private String unicode; + private float direction = -1.0F; + private float XDirAdj; + private float YDirAdj; + private float width; + private float heightDir; + private float fontSizeInPt; + private String fontName; + + /** Creates a RedTextPosition from the given TextPosition: copies bean properties with BeanUtils.copyProperties, then sets fontName, charCodes, widths and fontSizePt explicitly. */ + @SneakyThrows + public static RedTextPosition fromTextPosition(TextPosition textPosition) { + var pos = new RedTextPosition(); + BeanUtils.copyProperties(textPosition, pos); + pos.setFontName(textPosition.getFont().getName()); + // These getters are named differently from the target fields, so the values are set by hand. NOTE(review): fontSizeInPt is also a field here, so fontSizePt looks redundant - confirm. + pos.setCharCodes(textPosition.getCharacterCodes()); + pos.setWidths(textPosition.getIndividualWidths()); + pos.setFontSizePt(textPosition.getFontSizeInPt()); + + return pos; + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java index c6181f4e..786e1483 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java @@ -3,26 +3,45 @@ package com.iqser.red.service.redaction.v1.server.parsing.model; import com.iqser.red.service.redaction.v1.model.Point; import com.iqser.red.service.redaction.v1.model.Rectangle; import lombok.Data; -import lombok.RequiredArgsConstructor; +import lombok.NoArgsConstructor; import org.apache.pdfbox.text.TextPosition; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; @Data -@RequiredArgsConstructor +@NoArgsConstructor public class TextPositionSequence implements CharSequence { - private final int page; - private List textPositions = new ArrayList<>(); + private int page; + private List textPositions = new ArrayList<>(); + + /** Creates an empty sequence for the given page. */ + public TextPositionSequence(int page) { + this.page = page; + } + + /** Builds a sequence that stores the given list directly, skipping the RedTextPosition::fromTextPosition mapping applied by the (List, int) constructor. */ + public static TextPositionSequence fromData(List textPositions, int page) { + var textPositionSequence = new TextPositionSequence(); + textPositionSequence.textPositions = textPositions; + textPositionSequence.page = page; + + return textPositionSequence; + } public TextPositionSequence(List textPositions, int page) { - this.textPositions = textPositions; + this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList()); this.page = page; } + public void setTextPositions(List textPositions) { /* Replaces the content with RedTextPosition copies of the given TextPosition elements. */ + this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList()); + } + @Override public int length() { @@ -34,7 +53,7 @@ public class TextPositionSequence implements 
CharSequence { @Override public char charAt(int index) { - TextPosition textPosition = textPositionAt(index); + RedTextPosition textPosition = textPositionAt(index); String text = textPosition.getUnicode(); return text.charAt(0); } @@ -42,7 +61,7 @@ public class TextPositionSequence implements CharSequence { public char charAt(int index, boolean caseInSensitive) { - TextPosition textPosition = textPositionAt(index); + RedTextPosition textPosition = textPositionAt(index); String text = textPosition.getUnicode(); return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0); } @@ -51,7 +70,7 @@ public class TextPositionSequence implements CharSequence { @Override public TextPositionSequence subSequence(int start, int end) { - return new TextPositionSequence(textPositions.subList(start, end), page); + return fromData(textPositions.subList(start, end), page); } @@ -66,18 +85,24 @@ public class TextPositionSequence implements CharSequence { } - public TextPosition textPositionAt(int index) { + public RedTextPosition textPositionAt(int index) { return textPositions.get(index); } - public void add(TextPosition textPosition) { + public void add(RedTextPosition textPosition) { this.textPositions.add(textPosition); } + public void add(TextPosition textPosition) { + // Convert the PDFBox TextPosition to a RedTextPosition before appending it. + this.textPositions.add(RedTextPosition.fromTextPosition(textPosition)); + } + + public float getX1() { if (textPositions.get(0).getRotation() == 90) { @@ -145,9 +170,7 @@ public class TextPositionSequence implements CharSequence { public String getFont() { - return textPositions.get(0) - .getFont() - .toString() + return textPositions.get(0).getFontName() .toLowerCase() .replaceAll(",bold", "") .replaceAll(",italic", ""); @@ -156,7 +179,7 @@ public class TextPositionSequence implements CharSequence { public String getFontStyle() { - String lowercaseFontName = textPositions.get(0).getFont().toString().toLowerCase(); + String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(); if 
(lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) { return "bold, italic"; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java index 2ebc57f4..d610e94e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java @@ -13,11 +13,14 @@ import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationSer import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.apache.pdfbox.io.MemoryUsageSetting; +import org.apache.pdfbox.pdmodel.PDDocument; import org.kie.api.runtime.KieContainer; import org.springframework.stereotype.Service; import org.springframework.web.bind.annotation.RequestBody; import java.awt.geom.Rectangle2D; +import java.io.IOException; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -53,6 +56,18 @@ public class ReanalyzeService { log.info("Document structure analysis successful, starting redaction analysis..."); entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions()); + // Load the stored ORIGIN file again so postProcessSections can run on the section texts after entity processing. + var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN)); + + // TODO move this to where it makes sense - or remove completely + try (PDDocument pdDocument = PDDocument.load(storedObjectStream, 
MemoryUsageSetting.setupTempFileOnly())) { + pdDocument.setAllSecurityToBeRemoved(true); + pdfSegmentationService.postProcessSections(pdDocument, classifiedDoc.getSectionText()); + } catch (IOException e) { + /* Best effort: log the failure with its stack trace and continue with redaction-log creation. */ log.error("Post-processing sections of the origin PDF failed for file {}", analyzeRequest.getFileId(), e); + } + + redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest .getRuleSetId()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java index a046fb08..49bfe693 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java @@ -4,6 +4,7 @@ import com.iqser.red.service.redaction.v1.model.*; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; @@ -14,7 +15,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; import lombok.RequiredArgsConstructor; import org.apache.commons.collections4.CollectionUtils; -import 
org.apache.pdfbox.text.TextPosition; import org.springframework.stereotype.Service; import java.util.ArrayList; @@ -272,24 +272,24 @@ public class RedactionLogCreatorService { } - private List getRectanglesPerLine(List textPositions, int page) { + private List getRectanglesPerLine(List textPositions, int page) { /* Splits the positions into runs of equal YDirAdj and collects one rectangle per run. */ List rectangles = new ArrayList<>(); if (textPositions.size() == 1) { - rectangles.add(new TextPositionSequence(textPositions, page).getRectangle()); + rectangles.add(TextPositionSequence.fromData(textPositions, page).getRectangle()); } else { float y = textPositions.get(0).getYDirAdj(); int startIndex = 0; for (int i = 1; i < textPositions.size(); i++) { float yDirAdj = textPositions.get(i).getYDirAdj(); if (yDirAdj != y) { - rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, i), page).getRectangle()); + rectangles.add(TextPositionSequence.fromData(textPositions.subList(startIndex, i), page).getRectangle()); y = yDirAdj; startIndex = i; } } if (startIndex != textPositions.size()) { - rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, textPositions.size()), page).getRectangle()); + rectangles.add(TextPositionSequence.fromData(textPositions.subList(startIndex, textPositions.size()), page).getRectangle()); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java index 2eb06c3d..12fc519a 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java @@ -53,7 +53,7 @@ public class PdfSegmentationService { private final 
ImageClassificationService imageClassificationService; - private void postProcessSections(PDDocument pdDocument, List texts) { + public void postProcessSections(PDDocument pdDocument, List texts) { /* Public because the ReanalyzeService hunk of this patch calls it; the in-class call is removed in the hunk below. */ try { for (SectionText sectionText : texts) { @@ -194,9 +194,6 @@ public class PdfSegmentationService { pdDocument = reinitializePDDocument(tempFile, pdDocument); - // This can be improved an done in one pass, but it's complicated to do right away - postProcessSections(pdDocument, document.getSectionText()); - IOUtils.close(pdDocument); tempFile.delete();