RED-6126: performance-test

* Re-enabled overlap detection
* Re-create the helper document for every page instead of reusing a single document and adding/removing pages
This commit is contained in:
Kilian Schuettler 2023-02-09 11:22:39 +01:00
parent e705f869fd
commit 6ccf3f80fc
3 changed files with 26 additions and 10 deletions

View File

@ -89,7 +89,7 @@ public class InvisibleElementRemovalService {
context.visitedXObjIds().clear();
// removeOverlappedElements(page, writer, context);
removeOverlappedElements(page, writer, context);
}
pdfDoc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null);
}

View File

@ -6,7 +6,6 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.util.Map;
import io.micrometer.core.annotation.Timed;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
@ -23,6 +22,7 @@ import com.pdftron.pdf.Page;
import com.pdftron.pdf.RectCollection;
import com.pdftron.sdf.SDFDoc;
import io.micrometer.core.annotation.Timed;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@ -81,15 +81,16 @@ public class OCRService {
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true);
OCROptions options = new OCROptions();
PDFDoc ocrPageDoc = new PDFDoc();
int numProcessedPages = 0;
for (Integer pageId : pageIdToRectCollection.keySet()) {
try {
// optimization by only scanning pages that contain images
// optimization: creating a new document per page is faster than reusing a single document and adding/removing pages one by one
OCROptions options = new OCROptions();
PDFDoc ocrPageDoc = new PDFDoc();
// optimization: only scanning pages that contain images
Page pdfPage = pdfDoc.getPage(pageId);
pdfPage.setMediaBox(pdfPage.getCropBox()); // this line ensures the ocr text is placed correctly by PDFTron
// setting the media box to the crop box ensures the OCR text is placed correctly by PDFTron
pdfPage.setMediaBox(pdfPage.getCropBox());
ocrPageDoc.pagePushBack(pdfPage);
options.addTextZonesForPage(pageIdToRectCollection.get(pageId), 1);
options.addLang(ENGLISH);
@ -109,7 +110,7 @@ public class OCRService {
Page ocrPage = ocrPageDoc.getPage(1);
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage);
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1));
ocrPageDoc.pageRemove(ocrPageDoc.getPageIterator(1));
ocrPageDoc.close();
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
@ -124,8 +125,6 @@ public class OCRService {
}
}
ocrPageDoc.close();
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
.fileId(fileId)
@ -137,4 +136,5 @@ public class OCRService {
Optimizer.optimize(pdfDoc);
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
}
}

View File

@ -0,0 +1,16 @@
<Configuration>
<!-- Logging configuration: a single console appender; root logger at warn, loggers under com.iqser at info. -->
<!-- NOTE(review): element names match the Log4j 2 XML configuration format; confirm against the logging backend on the classpath. -->
<Appenders>
<!-- Console appender named CONSOLE, targeting SYSTEM_OUT (standard output). -->
<Console name="CONSOLE" target="SYSTEM_OUT">
<!-- Line layout: timestamp (HH:mm:ss.SSS), thread name in brackets, level padded to width 5, logger name with the {36} precision specifier, message, line separator. -->
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
</Console>
</Appenders>
<Loggers>
<!-- Root logger: level warn, output routed to the CONSOLE appender. -->
<Root level="warn">
<AppenderRef ref="CONSOLE"/>
</Root>
<!-- Loggers under com.iqser use level info. No AppenderRef is declared here, so output presumably reaches CONSOLE through the root logger (default additivity); TODO confirm. -->
<Logger name="com.iqser" level="info"/>
</Loggers>
</Configuration>