Serialization of text

This commit is contained in:
Timo 2021-04-19 13:08:32 +03:00
parent 42fcea85d3
commit 5c2596e268
9 changed files with 167 additions and 23 deletions

View File

@ -31,6 +31,12 @@ public class SectionText {
private List<Integer> cellStarts = new ArrayList<>();
/**
 * Stores the tabular data of this section after dropping any entry held under a
 * {@code null} key.
 *
 * <p>The map is not copied: the {@code null} key is removed from the passed instance
 * itself, and that same instance is then kept by reference. A {@code null} argument is
 * stored as-is instead of failing with a {@link NullPointerException}.
 *
 * @param tabularData map to store; may be {@code null}
 */
public void setTabularData(Map<String, CellValue> tabularData) {
    // Guard first: calling remove(...) on a null map would throw before the field is assigned.
    if (tabularData != null) {
        tabularData.remove(null);
    }
    this.tabularData = tabularData;
}
public SearchableText getSearchableText() {
SearchableText searchableText = new SearchableText();

View File

@ -6,6 +6,7 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractT
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
@ -13,6 +14,7 @@ import java.util.List;
@AllArgsConstructor
@Builder
@Data
@NoArgsConstructor
public class TextBlock extends AbstractTextContainer {
@Builder.Default

View File

@ -0,0 +1,43 @@
package com.iqser.red.service.redaction.v1.server.configuration;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.Version;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.module.SimpleModule;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Primary;
import java.io.IOException;
/**
 * Spring configuration that contributes the primary Jackson {@link ObjectMapper} bean.
 *
 * <p>The mapper is a plain {@code new ObjectMapper()} with one extra module that makes
 * PDFBox font objects serialize as JSON {@code null}.
 */
@Configuration
public class ObjectMapperConfiguration {

    /**
     * Creates the mapper and registers the font-suppressing module on it.
     *
     * @return the configured mapper
     */
    @Bean
    @Primary
    public ObjectMapper objectMapper() {
        return new ObjectMapper().registerModule(fontModule());
    }

    /**
     * Builds the module that binds {@link PDFontSerializer} to {@link PDFont} and to
     * {@link PDTrueTypeFont}.
     */
    private static SimpleModule fontModule() {
        SimpleModule module = new SimpleModule("SimpleModule", new Version(1, 0, 0, null));
        return module
                .addSerializer(PDFont.class, new PDFontSerializer())
                .addSerializer(PDTrueTypeFont.class, new PDFontSerializer());
    }

    /**
     * Serializer that ignores the font it is given and always emits JSON {@code null}.
     */
    public static class PDFontSerializer extends JsonSerializer<PDFont> {

        @Override
        public void serialize(PDFont font, JsonGenerator generator, SerializerProvider provider) throws IOException {
            generator.writeNull();
        }
    }
}

View File

@ -0,0 +1,10 @@
package com.iqser.red.service.redaction.v1.server.parsing.model;
import lombok.Data;
/**
 * Data holder wrapping a single {@code float[]}.
 *
 * <p>Accessors, {@code equals}/{@code hashCode} and {@code toString} are generated by
 * Lombok's {@code @Data}.
 * NOTE(review): presumably a serializable stand-in for PDFBox's {@code Matrix} - confirm
 * against the callers, none are visible here.
 */
@Data
public class RedMatrix {
// Raw float values; layout and length are not established here - TODO confirm.
private float[] single;
}

View File

@ -0,0 +1,48 @@
package com.iqser.red.service.redaction.v1.server.parsing.model;
import lombok.Data;
import lombok.SneakyThrows;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
import org.springframework.beans.BeanUtils;
@Data
public class RedTextPosition {
private Matrix textMatrix;
private float endX;
private float endY;
private float maxHeight;
private int rotation;
private float x;
private float y;
private float pageHeight;
private float pageWidth;
private float widthOfSpace;
private int[] charCodes;
private float fontSize;
private float fontSizePt;
private float[] widths;
private String unicode;
private float direction = -1.0F;
private float XDirAdj;
private float YDirAdj;
private float width;
private float heightDir;
private float fontSizeInPt;
private String fontName;
@SneakyThrows
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
var pos = new RedTextPosition();
BeanUtils.copyProperties(textPosition, pos);
pos.setFontName(textPosition.getFont().getName());
pos.setCharCodes(textPosition.getCharacterCodes());
pos.setWidths(textPosition.getIndividualWidths());
pos.setFontSizePt(textPosition.getFontSizeInPt());
return pos;
}
}

View File

@ -3,26 +3,45 @@ package com.iqser.red.service.redaction.v1.server.parsing.model;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import lombok.NoArgsConstructor;
import org.apache.pdfbox.text.TextPosition;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
@Data
@RequiredArgsConstructor
@NoArgsConstructor
public class TextPositionSequence implements CharSequence {
private final int page;
private List<TextPosition> textPositions = new ArrayList<>();
private int page;
private List<RedTextPosition> textPositions = new ArrayList<>();
/**
 * Creates a sequence for the given page without adding any text positions.
 *
 * @param page value stored in the {@code page} field (presumably the page number - TODO confirm)
 */
public TextPositionSequence(int page) {
this.page = page;
}
/**
 * Wraps positions that are already {@link RedTextPosition}s in a new sequence.
 *
 * <p>No conversion and no copy takes place: the returned sequence keeps the given list
 * by reference.
 *
 * @param textPositions positions backing the new sequence
 * @param page value for the sequence's {@code page} field
 * @return a new sequence holding exactly the given list and page
 */
public static TextPositionSequence fromData(List<RedTextPosition> textPositions, int page) {
    TextPositionSequence sequence = new TextPositionSequence();
    sequence.page = page;
    sequence.textPositions = textPositions;
    return sequence;
}
public TextPositionSequence(List<TextPosition> textPositions, int page) {
this.textPositions = textPositions;
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
this.page = page;
}
/**
 * Replaces the stored positions with converted copies of the given PDFBox positions.
 *
 * <p>Each element is converted through {@link RedTextPosition#fromTextPosition(TextPosition)},
 * in order, into a freshly created list.
 *
 * @param textPositions PDFBox positions to convert and store
 */
public void setTextPositions(List<TextPosition> textPositions) {
    List<RedTextPosition> converted = new ArrayList<>();
    for (TextPosition position : textPositions) {
        converted.add(RedTextPosition.fromTextPosition(position));
    }
    this.textPositions = converted;
}
@Override
public int length() {
@ -34,7 +53,7 @@ public class TextPositionSequence implements CharSequence {
@Override
public char charAt(int index) {
TextPosition textPosition = textPositionAt(index);
RedTextPosition textPosition = textPositionAt(index);
String text = textPosition.getUnicode();
return text.charAt(0);
}
@ -42,7 +61,7 @@ public class TextPositionSequence implements CharSequence {
public char charAt(int index, boolean caseInSensitive) {
TextPosition textPosition = textPositionAt(index);
RedTextPosition textPosition = textPositionAt(index);
String text = textPosition.getUnicode();
return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0);
}
@ -51,7 +70,7 @@ public class TextPositionSequence implements CharSequence {
@Override
public TextPositionSequence subSequence(int start, int end) {
return new TextPositionSequence(textPositions.subList(start, end), page);
return fromData(textPositions.subList(start, end), page);
}
@ -66,18 +85,24 @@ public class TextPositionSequence implements CharSequence {
}
public TextPosition textPositionAt(int index) {
public RedTextPosition textPositionAt(int index) {
return textPositions.get(index);
}
public void add(TextPosition textPosition) {
public void add(RedTextPosition textPosition) {
this.textPositions.add(textPosition);
}
/**
 * Converts a PDFBox text position and appends the result to this sequence.
 *
 * @param textPosition PDFBox position to convert via
 *        {@link RedTextPosition#fromTextPosition(TextPosition)} and append
 */
public void add(TextPosition textPosition) {
    RedTextPosition converted = RedTextPosition.fromTextPosition(textPosition);
    textPositions.add(converted);
}
public float getX1() {
if (textPositions.get(0).getRotation() == 90) {
@ -145,9 +170,7 @@ public class TextPositionSequence implements CharSequence {
public String getFont() {
return textPositions.get(0)
.getFont()
.toString()
return textPositions.get(0).getFontName()
.toLowerCase()
.replaceAll(",bold", "")
.replaceAll(",italic", "");
@ -156,7 +179,7 @@ public class TextPositionSequence implements CharSequence {
public String getFontStyle() {
String lowercaseFontName = textPositions.get(0).getFont().toString().toLowerCase();
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) {
return "bold, italic";

View File

@ -13,11 +13,14 @@ import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationSer
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import org.springframework.web.bind.annotation.RequestBody;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@ -53,6 +56,18 @@ public class ReanalyzeService {
log.info("Document structure analysis successful, starting redaction analysis...");
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN));
// TODO move this to where it makes sense - or remove completely
try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
pdDocument.setAllSecurityToBeRemoved(true);
pdfSegmentationService.postProcessSections(pdDocument, classifiedDoc.getSectionText());
} catch (IOException e) {
e.printStackTrace();
}
redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest
.getRuleSetId());

View File

@ -4,6 +4,7 @@ import com.iqser.red.service.redaction.v1.model.*;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
@ -14,7 +15,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
@ -272,24 +272,24 @@ public class RedactionLogCreatorService {
}
private List<Rectangle> getRectanglesPerLine(List<TextPosition> textPositions, int page) {
private List<Rectangle> getRectanglesPerLine(List<RedTextPosition> textPositions, int page) {
List<Rectangle> rectangles = new ArrayList<>();
if (textPositions.size() == 1) {
rectangles.add(new TextPositionSequence(textPositions, page).getRectangle());
rectangles.add( TextPositionSequence.fromData(textPositions, page).getRectangle());
} else {
float y = textPositions.get(0).getYDirAdj();
int startIndex = 0;
for (int i = 1; i < textPositions.size(); i++) {
float yDirAdj = textPositions.get(i).getYDirAdj();
if (yDirAdj != y) {
rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, i), page).getRectangle());
rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, i), page).getRectangle());
y = yDirAdj;
startIndex = i;
}
}
if (startIndex != textPositions.size()) {
rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, textPositions.size()), page).getRectangle());
rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, textPositions.size()), page).getRectangle());
}
}

View File

@ -53,7 +53,7 @@ public class PdfSegmentationService {
private final ImageClassificationService imageClassificationService;
private void postProcessSections(PDDocument pdDocument, List<SectionText> texts) {
public void postProcessSections(PDDocument pdDocument, List<SectionText> texts) {
try {
for (SectionText sectionText : texts) {
@ -194,9 +194,6 @@ public class PdfSegmentationService {
pdDocument = reinitializePDDocument(tempFile, pdDocument);
// This can be improved and done in one pass, but it's complicated to do right away
postProcessSections(pdDocument, document.getSectionText());
IOUtils.close(pdDocument);
tempFile.delete();