Pull request #12: RED-6280: Performance Test Issue with OCR-Service
Merge in RED/ocr-service from RED-6280 to master * commit '742725834933ad74ad582366b2b62015524bedb3': RED-6280: Performance Test Issue with OCR-Service *removed init/terminate calls again *manual memory cleanup at every opportunity
This commit is contained in:
commit
a4ca2db37d
@ -6,6 +6,8 @@ import lombok.SneakyThrows;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import javax.annotation.PostConstruct;
|
||||
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
public class PDFNetInitializer {
|
||||
@ -18,14 +20,12 @@ public class PDFNetInitializer {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@PostConstruct
|
||||
// Do not change this back to an ApplicationRunner: with an ApplicationRunner, messages are taken from the queue before PDFNet is initialized, which leads to an UnsatisfiedLinkError.
|
||||
public void init() {
|
||||
|
||||
PDFNet.initialize(pdftronLicense);
|
||||
PDFNet.setTempPath("/tmp/pdftron");
|
||||
PDFNet.addResourceSearchPath(ocrModulePath);
|
||||
|
||||
PDFNet.initialize(pdftronLicense);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -47,16 +47,17 @@ public class ImagePositionRetrievalService {
|
||||
ElementReader reader = new ElementReader();
|
||||
for (int pageId = 1; pageId <= pdfDoc.getPageCount(); ++pageId) {
|
||||
RectCollection imagePositions = new RectCollection();
|
||||
reader.begin(pdfDoc.getPage(pageId));
|
||||
|
||||
reader.begin(pdfDoc.getPage(pageId));
|
||||
findImagePositionsOnPage(reader, imagePositions, pdfDoc.getPage(pageId), mirrorY);
|
||||
imagePositions = mergeOverlappingRects(imagePositions);
|
||||
|
||||
reader.end();
|
||||
|
||||
if (imagePositions.getNumRects() > 0) {
|
||||
pageIdToImagePositions.put(pageId, imagePositions);
|
||||
}
|
||||
}
|
||||
reader.destroy();
|
||||
return pageIdToImagePositions;
|
||||
}
|
||||
|
||||
|
||||
@ -77,7 +77,6 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
||||
|
||||
|
||||
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
||||
.reader(reader)
|
||||
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
||||
@ -92,8 +91,20 @@ public class InvisibleElementRemovalService {
|
||||
context.visitedXObjIds().clear();
|
||||
|
||||
removeOverlappedElements(page, writer, context);
|
||||
reader.end();
|
||||
writer.end();
|
||||
}
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null);
|
||||
|
||||
try {
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
} catch (Exception e) {
|
||||
log.error("File could not be saved after invisible element removal");
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
writer.destroy();
|
||||
reader.destroy();
|
||||
pdfDoc.close();
|
||||
}
|
||||
|
||||
|
||||
@ -216,13 +227,14 @@ public class InvisibleElementRemovalService {
|
||||
formWriter.setDefaultGState(context.reader());
|
||||
|
||||
processElements(formWriter, context);
|
||||
formWriter.end();
|
||||
formWriter.destroy();
|
||||
context.reader().end();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
PathData pathData = pathElement.getPathData();
|
||||
|
||||
if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) {
|
||||
@ -352,7 +364,7 @@ public class InvisibleElementRemovalService {
|
||||
formWriter.setDefaultGState(context.reader());
|
||||
|
||||
processOverlappedElements(formWriter, context);
|
||||
formWriter.end();
|
||||
formWriter.destroy();
|
||||
context.reader().end();
|
||||
}
|
||||
}
|
||||
@ -427,6 +439,9 @@ public class InvisibleElementRemovalService {
|
||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
rect.getGState().setStrokeColor(colorPt);
|
||||
writer.writePlacedElement(rect);
|
||||
|
||||
colorPt.destroy();
|
||||
eb.destroy();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -98,6 +98,7 @@ public class OCRService {
|
||||
getAllOcrTextZonesAsString(pageIdToRectCollection, pageId));
|
||||
|
||||
replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc);
|
||||
|
||||
singlePagePdfDoc.close();
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
|
||||
@ -1,26 +1,23 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
|
||||
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
||||
import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
|
||||
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||
import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer;
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
|
||||
import feign.FeignException;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
|
||||
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
||||
import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -30,7 +27,6 @@ public class OcrMessageReceiver {
|
||||
private final ObjectMapper objectMapper;
|
||||
private final FileStorageService fileStorageService;
|
||||
private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
|
||||
private final PDFNetInitializer pdfNetInitializer;
|
||||
|
||||
private final OCRService ocrService;
|
||||
|
||||
@ -39,7 +35,6 @@ public class OcrMessageReceiver {
|
||||
@RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
|
||||
public void receiveOcr(String in) throws JsonProcessingException {
|
||||
|
||||
pdfNetInitializer.init();
|
||||
DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
|
||||
|
||||
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
@ -62,8 +57,6 @@ public class OcrMessageReceiver {
|
||||
}
|
||||
|
||||
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
PDFNet.terminate();
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,24 +1,28 @@
|
||||
package com.iqser.red.service.ocr.v1.server;
|
||||
|
||||
import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer;
|
||||
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
import lombok.SneakyThrows;
|
||||
import org.junit.jupiter.api.*;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.context.annotation.Import;
|
||||
import org.springframework.context.annotation.Primary;
|
||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ExtendWith(SpringExtension.class)
|
||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||
@ -28,19 +32,14 @@ public class AbstractTest {
|
||||
@Autowired
|
||||
protected StorageService storageService;
|
||||
|
||||
@Autowired
|
||||
private PDFNetInitializer pdfNetInitializer;
|
||||
@MockBean
|
||||
protected RabbitTemplate rabbitTemplate;
|
||||
|
||||
@BeforeEach
|
||||
@SneakyThrows
|
||||
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
|
||||
public void initPDFNet() {
|
||||
pdfNetInitializer.init();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void terminatePDFNet() {
|
||||
PDFNet.terminate();
|
||||
System.out.println("PDFNet Terminated");
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,33 +1,29 @@
|
||||
package com.iqser.red.service.ocr.v1.server;
|
||||
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.PdfTextExtraction.extractAllTextFromDocument;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
|
||||
import com.iqser.red.service.ocr.v1.server.service.OCRService;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.*;
|
||||
import com.pdftron.pdf.OCRModule;
|
||||
|
||||
import io.micrometer.prometheus.PrometheusMeterRegistry;
|
||||
import io.micrometer.prometheus.PrometheusTimer;
|
||||
import lombok.SneakyThrows;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
@SpringBootTest(properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
|
||||
public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@ -35,9 +31,6 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@Autowired
|
||||
protected ObjectMapper objectMapper;
|
||||
|
||||
@MockBean
|
||||
protected RabbitTemplate rabbitTemplate;
|
||||
|
||||
@Autowired
|
||||
private OCRService ocrService;
|
||||
|
||||
@ -139,25 +132,5 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
return extractAllTextFromDocument(fileStream);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
private static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||
TextExtractor extractor = new TextExtractor();
|
||||
List<String> texts = new ArrayList<>();
|
||||
|
||||
PageIterator iterator = pdfDoc.getPageIterator();
|
||||
while (iterator.hasNext()) {
|
||||
Page page = iterator.next();
|
||||
extractor.begin(page);
|
||||
texts.add(extractor.getAsText());
|
||||
}
|
||||
|
||||
return String.join("\n", texts);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -1,15 +1,9 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import com.iqser.red.service.ocr.v1.server.AbstractTest;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.*;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
import lombok.SneakyThrows;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.PdfDraw.drawGrid;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.PdfDraw.drawRectCollection;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
@ -20,8 +14,18 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.iqser.red.service.ocr.v1.server.AbstractTest;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
|
||||
class ImagePositionRetrievalServiceTest extends AbstractTest {
|
||||
@ -29,10 +33,6 @@ class ImagePositionRetrievalServiceTest extends AbstractTest {
|
||||
@Autowired
|
||||
private ImagePositionRetrievalService imagePositionRetrievalService;
|
||||
|
||||
@MockBean
|
||||
protected RabbitTemplate rabbitTemplate;
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testImagePositionRetrievalForRotateTestFileWithImages() {
|
||||
@ -116,37 +116,38 @@ class ImagePositionRetrievalServiceTest extends AbstractTest {
|
||||
|
||||
private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException {
|
||||
|
||||
InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath());
|
||||
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||
try (InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath())) {
|
||||
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||
|
||||
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false);
|
||||
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false);
|
||||
|
||||
ElementWriter writer = new ElementWriter();
|
||||
pageIdToRectCollection.forEach((pageId, rectCollection) -> {
|
||||
try {
|
||||
writer.begin(pdfDoc.getPage(pageId));
|
||||
drawRectCollection(writer, rectCollection, pdfDoc.getPage(pageId));
|
||||
drawGrid(writer, pdfDoc.getPage(pageId));
|
||||
writer.end();
|
||||
StringBuilder zonesString = new StringBuilder();
|
||||
for (int j = 0; j < rectCollection.getNumRects(); ++j) {
|
||||
var r = rectCollection.getRectAt(j);
|
||||
zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2()));
|
||||
ElementWriter writer = new ElementWriter();
|
||||
pageIdToRectCollection.forEach((pageId, rectCollection) -> {
|
||||
try {
|
||||
writer.begin(pdfDoc.getPage(pageId));
|
||||
drawRectCollection(writer, rectCollection);
|
||||
drawGrid(writer, pdfDoc.getPage(pageId));
|
||||
writer.end();
|
||||
StringBuilder zonesString = new StringBuilder();
|
||||
for (int j = 0; j < rectCollection.getNumRects(); ++j) {
|
||||
var r = rectCollection.getRectAt(j);
|
||||
zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2()));
|
||||
}
|
||||
System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, pdfDoc.getPage(pageId).getRotation(), zonesString);
|
||||
} catch (PDFNetException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, pdfDoc.getPage(pageId).getRotation(), zonesString);
|
||||
} catch (PDFNetException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// Check visually for red Rectangles to match images in the saved pdf file
|
||||
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf")) {
|
||||
out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
|
||||
// Check visually for red Rectangles to match images in the saved pdf file
|
||||
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf")) {
|
||||
out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
|
||||
}
|
||||
pdfDoc.close();
|
||||
System.out.println("result file: " + getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf");
|
||||
// round all coords to nearest int to account for inconsistencies with the calculation of the bounding box
|
||||
return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList());
|
||||
}
|
||||
System.out.println("result file: " + getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf");
|
||||
fileStream.close();
|
||||
// round all coords to nearest int to account for inconsistencies with the calculation of the bounding box
|
||||
return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
@ -161,59 +162,4 @@ class ImagePositionRetrievalServiceTest extends AbstractTest {
|
||||
return coords;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void drawGrid(ElementWriter writer, Page page) {
|
||||
|
||||
ElementBuilder eb = new ElementBuilder();
|
||||
double dX = 15;
|
||||
double dY = 15;
|
||||
int nRows = (int) (page.getPageHeight() / dY) + 1;
|
||||
int nCols = (int) (page.getPageWidth() / dX) + 1;
|
||||
for (int row = 0; row < nRows; ++row) {
|
||||
for (int col = 0; col < nCols; ++col) {
|
||||
Element cell = eb.createRect(col * dX, row * dY, dX, dY);
|
||||
cell.setPathStroke(true);
|
||||
cell.getGState().setLineWidth(1);
|
||||
cell.getGState().setStrokeOpacity(0.1);
|
||||
cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
if (row == 0 && col == 0) {
|
||||
cell.getGState().setStrokeColor(new ColorPt(0, 0, 1));
|
||||
cell.setPathFill(true);
|
||||
cell.getGState().setFillOpacity(0.8);
|
||||
cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
cell.getGState().setFillColor(new ColorPt(0, 0, 1));
|
||||
} else {
|
||||
cell.setPathFill(false);
|
||||
cell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1));
|
||||
}
|
||||
writer.writePlacedElement(cell);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection, Page page) {
|
||||
|
||||
ColorPt colorPt = new ColorPt(1, 0, 0);
|
||||
ElementBuilder eb = new ElementBuilder();
|
||||
for (int i = 0; i < rectCollection.getNumRects(); ++i) {
|
||||
Rect r = rectCollection.getRectAt(i);
|
||||
Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());
|
||||
|
||||
rect.setPathStroke(true);
|
||||
rect.getGState().setLineWidth(5);
|
||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
rect.getGState().setStrokeColor(colorPt);
|
||||
|
||||
rect.setPathFill(true);
|
||||
rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
rect.getGState().setFillColor(colorPt);
|
||||
rect.getGState().setFillOpacity(0.5);
|
||||
|
||||
writer.writePlacedElement(rect);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,31 +1,25 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import com.iqser.red.service.ocr.v1.server.AbstractTest;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.pdf.TextExtractor;
|
||||
import lombok.SneakyThrows;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.PdfTextExtraction.extractAllTextFromDocument;
|
||||
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.iqser.red.service.ocr.v1.server.AbstractTest;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class InvisibleElementRemovalServiceTest extends AbstractTest {
|
||||
|
||||
@Autowired
|
||||
private InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
|
||||
@MockBean
|
||||
protected RabbitTemplate rabbitTemplate;
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
@ -36,27 +30,18 @@ public class InvisibleElementRemovalServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
||||
|
||||
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,false);
|
||||
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out, false);
|
||||
}
|
||||
|
||||
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_delta.pdf")) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,true);
|
||||
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out, true);
|
||||
}
|
||||
|
||||
System.out.println("Output File without invisible elements: files/" + fileName + ".pdf");
|
||||
System.out.println("Output Delta File without invisible elements: files/" + fileName + "_delta.pdf");
|
||||
TextExtractor extractor = new TextExtractor();
|
||||
|
||||
PDFDoc pdfDoc;
|
||||
try(var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
pdfDoc = new PDFDoc(fileStream);
|
||||
}
|
||||
|
||||
PageIterator iterator = pdfDoc.getPageIterator();
|
||||
while (iterator.hasNext()) {
|
||||
Page page = iterator.next();
|
||||
extractor.begin(page);
|
||||
String[] text = extractor.getAsText().split("\n");
|
||||
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
String[] text = extractAllTextFromDocument(fileStream).split("\n");
|
||||
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,74 @@
|
||||
package com.iqser.red.service.ocr.v1.server.utils;
|
||||
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
import com.pdftron.pdf.ColorSpace;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.Rect;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
|
||||
public class PdfDraw {
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawGrid(ElementWriter writer, Page page) {
|
||||
|
||||
ElementBuilder eb = new ElementBuilder();
|
||||
double dX = 15;
|
||||
double dY = 15;
|
||||
int nRows = (int) (page.getPageHeight() / dY) + 1;
|
||||
int nCols = (int) (page.getPageWidth() / dX) + 1;
|
||||
for (int row = 0; row < nRows; ++row) {
|
||||
for (int col = 0; col < nCols; ++col) {
|
||||
Element cell = eb.createRect(col * dX, row * dY, dX, dY);
|
||||
cell.setPathStroke(true);
|
||||
cell.getGState().setLineWidth(1);
|
||||
cell.getGState().setStrokeOpacity(0.1);
|
||||
cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
if (row == 0 && col == 0) {
|
||||
cell.getGState().setStrokeColor(new ColorPt(0, 0, 1));
|
||||
cell.setPathFill(true);
|
||||
cell.getGState().setFillOpacity(0.8);
|
||||
cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
cell.getGState().setFillColor(new ColorPt(0, 0, 1));
|
||||
} else {
|
||||
cell.setPathFill(false);
|
||||
cell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1));
|
||||
}
|
||||
writer.writePlacedElement(cell);
|
||||
}
|
||||
}
|
||||
eb.destroy();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection) {
|
||||
|
||||
ColorPt colorPt = new ColorPt(1, 0, 0);
|
||||
ElementBuilder eb = new ElementBuilder();
|
||||
for (int i = 0; i < rectCollection.getNumRects(); ++i) {
|
||||
Rect r = rectCollection.getRectAt(i);
|
||||
Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());
|
||||
|
||||
rect.setPathStroke(true);
|
||||
rect.getGState().setLineWidth(5);
|
||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
rect.getGState().setStrokeColor(colorPt);
|
||||
|
||||
rect.setPathFill(true);
|
||||
rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
rect.getGState().setFillColor(colorPt);
|
||||
rect.getGState().setFillOpacity(0.5);
|
||||
|
||||
writer.writePlacedElement(rect);
|
||||
}
|
||||
colorPt.destroy();
|
||||
eb.destroy();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,35 @@
|
||||
package com.iqser.red.service.ocr.v1.server.utils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.pdf.TextExtractor;
|
||||
|
||||
|
||||
public class PdfTextExtraction {
|
||||
|
||||
public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||
TextExtractor extractor = new TextExtractor();
|
||||
List<String> texts = new ArrayList<>();
|
||||
|
||||
PageIterator iterator = pdfDoc.getPageIterator();
|
||||
while (iterator.hasNext()) {
|
||||
Page page = iterator.next();
|
||||
extractor.begin(page);
|
||||
texts.add(extractor.getAsText());
|
||||
}
|
||||
|
||||
extractor.destroy();
|
||||
pdfDoc.close();
|
||||
return String.join("\n", texts);
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user