RED-6126: Performance Tests
* Moved to streams for PDF file transfer
* Disabled overlap detection
This commit is contained in:
parent
b0a658213d
commit
0894d73216
@ -5,6 +5,7 @@ import java.awt.geom.AffineTransform;
|
|||||||
import java.awt.geom.GeneralPath;
|
import java.awt.geom.GeneralPath;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.io.OutputStream;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -59,10 +60,10 @@ public class InvisibleElementRemovalService {
|
|||||||
* @param pdfFile The PDF file to process
|
* @param pdfFile The PDF file to process
|
||||||
* @param delta If this flag is set only the removed Elements will be written to the output file.
|
* @param delta If this flag is set only the removed Elements will be written to the output file.
|
||||||
* The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap.
|
* The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap.
|
||||||
* @return The resulting PDF File as bytes.
|
* @param out OutputStream to write the resulting file to
|
||||||
**/
|
**/
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public byte[] removeInvisibleElements(InputStream pdfFile, boolean delta) {
|
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) {
|
||||||
|
|
||||||
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
||||||
|
|
||||||
@ -88,9 +89,9 @@ public class InvisibleElementRemovalService {
|
|||||||
|
|
||||||
context.visitedXObjIds().clear();
|
context.visitedXObjIds().clear();
|
||||||
|
|
||||||
removeOverlappedElements(page, writer, context);
|
// removeOverlappedElements(page, writer, context);
|
||||||
}
|
}
|
||||||
return pdfDoc.save(SDFDoc.SaveMode.REMOVE_UNUSED, null);
|
pdfDoc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -225,7 +226,7 @@ public class InvisibleElementRemovalService {
|
|||||||
|
|
||||||
//transform path to initial user space
|
//transform path to initial user space
|
||||||
var ctm = pathElement.getCTM();
|
var ctm = pathElement.getCTM();
|
||||||
var affineTransform = getAffineTransform(ctm);
|
var affineTransform = toAffineTransform(ctm);
|
||||||
linePath.transform(affineTransform);
|
linePath.transform(affineTransform);
|
||||||
|
|
||||||
var rect = linePath.getBounds2D();
|
var rect = linePath.getBounds2D();
|
||||||
@ -244,8 +245,13 @@ public class InvisibleElementRemovalService {
|
|||||||
writer.writeElement(pathElement);
|
writer.writeElement(pathElement);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
if (pathElement.isWindingFill()) {
|
||||||
|
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
|
||||||
|
} else {
|
||||||
|
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
||||||
|
}
|
||||||
|
|
||||||
if (inClippingPath) {
|
if (inClippingPath) {
|
||||||
// TODO: WINDING RULE
|
|
||||||
if (isFilledAndNonTransparent(pathElement)) {
|
if (isFilledAndNonTransparent(pathElement)) {
|
||||||
List<ElementFeatures> currentOverlappedElements = context.visibleElements()
|
List<ElementFeatures> currentOverlappedElements = context.visibleElements()
|
||||||
.stream()
|
.stream()
|
||||||
@ -270,12 +276,6 @@ public class InvisibleElementRemovalService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException {
|
|
||||||
|
|
||||||
return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||||
|
|
||||||
context.reader().begin(page);
|
context.reader().begin(page);
|
||||||
@ -422,6 +422,12 @@ public class InvisibleElementRemovalService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static AffineTransform toAffineTransform(Matrix2D ctm) throws PDFNetException {
|
||||||
|
|
||||||
|
return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Builder
|
@Builder
|
||||||
private record InvisibleElementRemovalContext(
|
private record InvisibleElementRemovalContext(
|
||||||
boolean delta,
|
boolean delta,
|
||||||
|
|||||||
@ -1,7 +1,9 @@
|
|||||||
package com.iqser.red.service.ocr.v1.server.service;
|
package com.iqser.red.service.ocr.v1.server.service;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.io.OutputStream;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||||
@ -54,31 +56,32 @@ public class OCRService {
|
|||||||
*
|
*
|
||||||
* @param dossierId The dossier id
|
* @param dossierId The dossier id
|
||||||
* @param fileId The file id
|
* @param fileId The file id
|
||||||
* @return the resulting PDF file as an InputStream
|
* @param out OutputStream to write the file to
|
||||||
*/
|
*/
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public InputStream runOcrOnDocument(String dossierId, String fileId) {
|
public void runOcrOnDocument(String dossierId, String fileId, OutputStream out) {
|
||||||
|
|
||||||
InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
|
|
||||||
|
|
||||||
byte[] fileWithoutInvisibleTextBytes = invisibleElementRemovalService.removeInvisibleElements(fileStream, false);
|
|
||||||
|
|
||||||
byte[] ocrBytes = runOcr(fileWithoutInvisibleTextBytes, fileId);
|
|
||||||
|
|
||||||
return new ByteArrayInputStream(ocrBytes);
|
|
||||||
|
|
||||||
|
try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
|
||||||
|
try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) {
|
||||||
|
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
|
||||||
|
}
|
||||||
|
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
|
||||||
|
runOcr(transferInputStream, out, fileId);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private byte[] runOcr(byte[] file, String fileId) {
|
private void runOcr(InputStream fileStream, OutputStream out, String fileId) {
|
||||||
|
|
||||||
PDFDoc pdfDoc = new PDFDoc(file);
|
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||||
|
|
||||||
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true);
|
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true);
|
||||||
|
|
||||||
OCROptions options = new OCROptions();
|
OCROptions options = new OCROptions();
|
||||||
PDFDoc ocrPageDoc = new PDFDoc();
|
PDFDoc ocrPageDoc = new PDFDoc();
|
||||||
|
|
||||||
int numProcessedPages = 0;
|
int numProcessedPages = 0;
|
||||||
for (Integer pageId : pageIdToRectCollection.keySet()) {
|
for (Integer pageId : pageIdToRectCollection.keySet()) {
|
||||||
try {
|
try {
|
||||||
@ -130,6 +133,7 @@ public class OCRService {
|
|||||||
.build()));
|
.build()));
|
||||||
|
|
||||||
Optimizer.optimize(pdfDoc);
|
Optimizer.optimize(pdfDoc);
|
||||||
return pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null);
|
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,9 @@
|
|||||||
package com.iqser.red.service.ocr.v1.server.service;
|
package com.iqser.red.service.ocr.v1.server.service;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
|
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
|
||||||
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
||||||
import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
||||||
@ -8,9 +12,9 @@ import org.springframework.stereotype.Service;
|
|||||||
|
|
||||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
|
||||||
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
||||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||||
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
|
|
||||||
|
|
||||||
import feign.FeignException;
|
import feign.FeignException;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -44,9 +48,14 @@ public class OcrMessageReceiver {
|
|||||||
fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile);
|
fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile);
|
||||||
}
|
}
|
||||||
|
|
||||||
var ocrResult = ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
try (var out = new ByteArrayOutputStream()) {
|
||||||
|
ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), out);
|
||||||
|
|
||||||
|
fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), new ByteArrayInputStream(out.toByteArray()));
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
|
||||||
fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), ocrResult);
|
|
||||||
|
|
||||||
long end = System.currentTimeMillis();
|
long end = System.currentTimeMillis();
|
||||||
log.info("Successfully processed ocr for file with dossierId {} and fileId {}, took {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), end - start);
|
log.info("Successfully processed ocr for file with dossierId {} and fileId {}, took {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), end - start);
|
||||||
|
|||||||
@ -3,8 +3,8 @@ package com.iqser.red.service.ocr.v1.server;
|
|||||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
|
import java.io.FileInputStream;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.io.InputStream;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@ -134,25 +134,29 @@ public class OcrServiceIntegrationTest {
|
|||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
||||||
|
|
||||||
var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN);
|
var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN);
|
||||||
storageService.storeObject(originId, pdfFileResource.getInputStream());
|
try (var fileStream = pdfFileResource.getInputStream()) {
|
||||||
|
storageService.storeObject(originId, fileStream);
|
||||||
try (InputStream ocrDocument = ocrService.runOcrOnDocument("dossier", "file")) {
|
|
||||||
byte[] ocrDocumentBytes = ocrDocument.readAllBytes();
|
|
||||||
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
|
||||||
out.write(ocrDocumentBytes);
|
|
||||||
}
|
|
||||||
TextExtractor extractor = new TextExtractor();
|
|
||||||
List<String> texts = new ArrayList<>();
|
|
||||||
PDFDoc pdfDoc = new PDFDoc(ocrDocumentBytes);
|
|
||||||
PageIterator iterator = pdfDoc.getPageIterator();
|
|
||||||
while (iterator.hasNext()) {
|
|
||||||
Page page = iterator.next();
|
|
||||||
extractor.begin(page);
|
|
||||||
texts.add(extractor.getAsText());
|
|
||||||
}
|
|
||||||
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
|
|
||||||
return String.join("\n", texts);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||||
|
ocrService.runOcrOnDocument("dossier", "file", out);
|
||||||
|
}
|
||||||
|
|
||||||
|
TextExtractor extractor = new TextExtractor();
|
||||||
|
List<String> texts = new ArrayList<>();
|
||||||
|
PDFDoc pdfDoc;
|
||||||
|
|
||||||
|
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||||
|
pdfDoc = new PDFDoc(fileStream);
|
||||||
|
}
|
||||||
|
PageIterator iterator = pdfDoc.getPageIterator();
|
||||||
|
while (iterator.hasNext()) {
|
||||||
|
Page page = iterator.next();
|
||||||
|
extractor.begin(page);
|
||||||
|
texts.add(extractor.getAsText());
|
||||||
|
}
|
||||||
|
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
|
||||||
|
return String.join("\n", texts);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -3,9 +3,8 @@ package com.iqser.red.service.ocr.v1.server.service;
|
|||||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||||
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||||
|
|
||||||
|
import java.io.FileInputStream;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
|
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.junit.jupiter.api.extension.ExtendWith;
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
@ -46,22 +45,23 @@ public class InvisibleElementRemovalServiceTest {
|
|||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
||||||
|
|
||||||
var initialFileStream = Files.newInputStream(pdfFileResource.getFile().toPath());
|
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||||
var fileWithoutInvisibleElements = invisibleElementRemovalService.removeInvisibleElements(initialFileStream, false);
|
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,false);
|
||||||
|
}
|
||||||
|
|
||||||
initialFileStream = Files.newInputStream(pdfFileResource.getFile().toPath());
|
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_delta.pdf")) {
|
||||||
var deltaFile = invisibleElementRemovalService.removeInvisibleElements(initialFileStream, true);
|
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,true);
|
||||||
|
}
|
||||||
|
|
||||||
String fileWithoutInvisibleTextLocation = getTemporaryDirectory() + "/" + fileName + ".pdf";
|
System.out.println("Output File without invisible elements: files/" + fileName + ".pdf");
|
||||||
String deltaFileLocation = getTemporaryDirectory() + "/" + fileName + "_delta.pdf";
|
System.out.println("Output Delta File without invisible elements: files/" + fileName + "_delta.pdf");
|
||||||
|
|
||||||
saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleElements);
|
|
||||||
saveToFile(deltaFileLocation, deltaFile);
|
|
||||||
|
|
||||||
System.out.println("Output File without invisible elements: " + fileWithoutInvisibleTextLocation);
|
|
||||||
System.out.println("Output Delta File: " + deltaFileLocation);
|
|
||||||
TextExtractor extractor = new TextExtractor();
|
TextExtractor extractor = new TextExtractor();
|
||||||
PDFDoc pdfDoc = new PDFDoc(fileWithoutInvisibleElements);
|
|
||||||
|
PDFDoc pdfDoc;
|
||||||
|
try(var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||||
|
pdfDoc = new PDFDoc(fileStream);
|
||||||
|
}
|
||||||
|
|
||||||
PageIterator iterator = pdfDoc.getPageIterator();
|
PageIterator iterator = pdfDoc.getPageIterator();
|
||||||
while (iterator.hasNext()) {
|
while (iterator.hasNext()) {
|
||||||
Page page = iterator.next();
|
Page page = iterator.next();
|
||||||
@ -70,16 +70,4 @@ public class InvisibleElementRemovalServiceTest {
|
|||||||
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
|
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void saveToFile(String location, byte[] fileBytes) {
|
|
||||||
|
|
||||||
try (var f_out = new FileOutputStream(location)) {
|
|
||||||
f_out.write(fileBytes);
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new RuntimeException("File location: " + location + "could not be openend, no file will be saved");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user