Serialization of text
This commit is contained in:
parent
42fcea85d3
commit
5c2596e268
@ -31,6 +31,12 @@ public class SectionText {
|
||||
private List<Integer> cellStarts = new ArrayList<>();
|
||||
|
||||
|
||||
public void setTabularData(Map<String, CellValue> tabularData) {
|
||||
tabularData.remove(null);
|
||||
this.tabularData = tabularData;
|
||||
}
|
||||
|
||||
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
SearchableText searchableText = new SearchableText();
|
||||
|
||||
@ -6,6 +6,7 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractT
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@ -13,6 +14,7 @@ import java.util.List;
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class TextBlock extends AbstractTextContainer {
|
||||
|
||||
@Builder.Default
|
||||
|
||||
@ -0,0 +1,43 @@
|
||||
package com.iqser.red.service.redaction.v1.server.configuration;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonGenerator;
|
||||
import com.fasterxml.jackson.core.Version;
|
||||
import com.fasterxml.jackson.databind.JsonSerializer;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.fasterxml.jackson.databind.SerializerProvider;
|
||||
import com.fasterxml.jackson.databind.module.SimpleModule;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.context.annotation.Primary;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
@Configuration
|
||||
public class ObjectMapperConfiguration {
|
||||
|
||||
|
||||
@Bean
|
||||
@Primary
|
||||
public ObjectMapper objectMapper() {
|
||||
var objectMapper = new ObjectMapper();
|
||||
SimpleModule simpleModule = new SimpleModule("SimpleModule",
|
||||
new Version(1, 0, 0, null));
|
||||
// simpleModule.addSerializer(new ItemSerializer());
|
||||
simpleModule.addSerializer(PDFont.class, new PDFontSerializer());
|
||||
simpleModule.addSerializer(PDTrueTypeFont.class, new PDFontSerializer());
|
||||
objectMapper.registerModule(simpleModule);
|
||||
|
||||
return objectMapper;
|
||||
}
|
||||
|
||||
|
||||
public static class PDFontSerializer extends JsonSerializer<PDFont> {
|
||||
|
||||
@Override
|
||||
public void serialize(PDFont t, JsonGenerator jsonGenerator, SerializerProvider serializerProvider) throws IOException {
|
||||
jsonGenerator.writeNull();
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,10 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
|
||||
@Data
|
||||
public class RedMatrix {
|
||||
|
||||
private float[] single;
|
||||
}
|
||||
@ -0,0 +1,48 @@
|
||||
package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.SneakyThrows;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.springframework.beans.BeanUtils;
|
||||
|
||||
@Data
|
||||
public class RedTextPosition {
|
||||
|
||||
private Matrix textMatrix;
|
||||
private float endX;
|
||||
private float endY;
|
||||
private float maxHeight;
|
||||
private int rotation;
|
||||
private float x;
|
||||
private float y;
|
||||
private float pageHeight;
|
||||
private float pageWidth;
|
||||
private float widthOfSpace;
|
||||
private int[] charCodes;
|
||||
private float fontSize;
|
||||
private float fontSizePt;
|
||||
private float[] widths;
|
||||
private String unicode;
|
||||
private float direction = -1.0F;
|
||||
private float XDirAdj;
|
||||
private float YDirAdj;
|
||||
private float width;
|
||||
private float heightDir;
|
||||
private float fontSizeInPt;
|
||||
private String fontName;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
|
||||
var pos = new RedTextPosition();
|
||||
BeanUtils.copyProperties(textPosition, pos);
|
||||
pos.setFontName(textPosition.getFont().getName());
|
||||
|
||||
pos.setCharCodes(textPosition.getCharacterCodes());
|
||||
pos.setWidths(textPosition.getIndividualWidths());
|
||||
pos.setFontSizePt(textPosition.getFontSizeInPt());
|
||||
|
||||
return pos;
|
||||
}
|
||||
}
|
||||
@ -3,26 +3,45 @@ package com.iqser.red.service.redaction.v1.server.parsing.model;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.NoArgsConstructor;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public class TextPositionSequence implements CharSequence {
|
||||
|
||||
private final int page;
|
||||
private List<TextPosition> textPositions = new ArrayList<>();
|
||||
private int page;
|
||||
private List<RedTextPosition> textPositions = new ArrayList<>();
|
||||
|
||||
|
||||
/**
 * Creates a sequence for the given page; the text-position list keeps its field default.
 *
 * @param page page value stored on this sequence
 */
public TextPositionSequence(int page) {
    this.page = page;
}
|
||||
|
||||
|
||||
public static TextPositionSequence fromData(List<RedTextPosition> textPositions, int page) {
|
||||
var textPositionSequence = new TextPositionSequence();
|
||||
textPositionSequence.textPositions = textPositions;
|
||||
textPositionSequence.page = page;
|
||||
|
||||
return textPositionSequence;
|
||||
}
|
||||
|
||||
|
||||
public TextPositionSequence(List<TextPosition> textPositions, int page) {
|
||||
|
||||
this.textPositions = textPositions;
|
||||
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
|
||||
this.page = page;
|
||||
}
|
||||
|
||||
public void setTextPositions(List<TextPosition> textPositions) {
|
||||
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
@ -34,7 +53,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
@Override
|
||||
public char charAt(int index) {
|
||||
|
||||
TextPosition textPosition = textPositionAt(index);
|
||||
RedTextPosition textPosition = textPositionAt(index);
|
||||
String text = textPosition.getUnicode();
|
||||
return text.charAt(0);
|
||||
}
|
||||
@ -42,7 +61,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
public char charAt(int index, boolean caseInSensitive) {
|
||||
|
||||
TextPosition textPosition = textPositionAt(index);
|
||||
RedTextPosition textPosition = textPositionAt(index);
|
||||
String text = textPosition.getUnicode();
|
||||
return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0);
|
||||
}
|
||||
@ -51,7 +70,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
@Override
|
||||
public TextPositionSequence subSequence(int start, int end) {
|
||||
|
||||
return new TextPositionSequence(textPositions.subList(start, end), page);
|
||||
return fromData(textPositions.subList(start, end), page);
|
||||
}
|
||||
|
||||
|
||||
@ -66,18 +85,24 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
public TextPosition textPositionAt(int index) {
|
||||
public RedTextPosition textPositionAt(int index) {
|
||||
|
||||
return textPositions.get(index);
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPosition textPosition) {
|
||||
public void add(RedTextPosition textPosition) {
|
||||
|
||||
this.textPositions.add(textPosition);
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPosition textPosition) {
|
||||
|
||||
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
|
||||
}
|
||||
|
||||
|
||||
public float getX1() {
|
||||
|
||||
if (textPositions.get(0).getRotation() == 90) {
|
||||
@ -145,9 +170,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
public String getFont() {
|
||||
|
||||
return textPositions.get(0)
|
||||
.getFont()
|
||||
.toString()
|
||||
return textPositions.get(0).getFontName()
|
||||
.toLowerCase()
|
||||
.replaceAll(",bold", "")
|
||||
.replaceAll(",italic", "");
|
||||
@ -156,7 +179,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
public String getFontStyle() {
|
||||
|
||||
String lowercaseFontName = textPositions.get(0).getFont().toString().toLowerCase();
|
||||
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
|
||||
|
||||
if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) {
|
||||
return "bold, italic";
|
||||
|
||||
@ -13,11 +13,14 @@ import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationSer
|
||||
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
@ -53,6 +56,18 @@ public class ReanalyzeService {
|
||||
log.info("Document structure analysis successful, starting redaction analysis...");
|
||||
|
||||
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
|
||||
|
||||
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest.getProjectId(), analyzeRequest.getFileId(), FileType.ORIGIN));
|
||||
|
||||
// TODO move this to where it makes sense - or remove completely
|
||||
try (PDDocument pdDocument = PDDocument.load(storedObjectStream, MemoryUsageSetting.setupTempFileOnly())) {
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
pdfSegmentationService.postProcessSections(pdDocument, classifiedDoc.getSectionText());
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
|
||||
redactionLogCreatorService.createRedactionLog(classifiedDoc, pageCount, analyzeRequest.getManualRedactions(), analyzeRequest
|
||||
.getRuleSetId());
|
||||
|
||||
|
||||
@ -4,6 +4,7 @@ import com.iqser.red.service.redaction.v1.model.*;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
@ -14,7 +15,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
@ -272,24 +272,24 @@ public class RedactionLogCreatorService {
|
||||
}
|
||||
|
||||
|
||||
private List<Rectangle> getRectanglesPerLine(List<TextPosition> textPositions, int page) {
|
||||
private List<Rectangle> getRectanglesPerLine(List<RedTextPosition> textPositions, int page) {
|
||||
|
||||
List<Rectangle> rectangles = new ArrayList<>();
|
||||
if (textPositions.size() == 1) {
|
||||
rectangles.add(new TextPositionSequence(textPositions, page).getRectangle());
|
||||
rectangles.add( TextPositionSequence.fromData(textPositions, page).getRectangle());
|
||||
} else {
|
||||
float y = textPositions.get(0).getYDirAdj();
|
||||
int startIndex = 0;
|
||||
for (int i = 1; i < textPositions.size(); i++) {
|
||||
float yDirAdj = textPositions.get(i).getYDirAdj();
|
||||
if (yDirAdj != y) {
|
||||
rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, i), page).getRectangle());
|
||||
rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, i), page).getRectangle());
|
||||
y = yDirAdj;
|
||||
startIndex = i;
|
||||
}
|
||||
}
|
||||
if (startIndex != textPositions.size()) {
|
||||
rectangles.add(new TextPositionSequence(textPositions.subList(startIndex, textPositions.size()), page).getRectangle());
|
||||
rectangles.add( TextPositionSequence.fromData(textPositions.subList(startIndex, textPositions.size()), page).getRectangle());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -53,7 +53,7 @@ public class PdfSegmentationService {
|
||||
private final ImageClassificationService imageClassificationService;
|
||||
|
||||
|
||||
private void postProcessSections(PDDocument pdDocument, List<SectionText> texts) {
|
||||
public void postProcessSections(PDDocument pdDocument, List<SectionText> texts) {
|
||||
|
||||
try {
|
||||
for (SectionText sectionText : texts) {
|
||||
@ -194,9 +194,6 @@ public class PdfSegmentationService {
|
||||
|
||||
pdDocument = reinitializePDDocument(tempFile, pdDocument);
|
||||
|
||||
// This can be improved and done in one pass, but it's complicated to do right away
|
||||
postProcessSections(pdDocument, document.getSectionText());
|
||||
|
||||
IOUtils.close(pdDocument);
|
||||
|
||||
tempFile.delete();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user