From 3e53e156a5b1a8757b42621f2a684c5f5255010a Mon Sep 17 00:00:00 2001 From: Philipp Schramm Date: Mon, 25 Jul 2022 16:04:51 +0200 Subject: [PATCH] RED-4771: Reformatted code --- .../src/main/java/buildjob/PlanSpec.java | 14 +- .../v1/server/client/MockMultipartFile.java | 14 +- .../v1/server/client/model/NerEntities.java | 4 +- .../v1/server/parsing/PDFTextStripper.java | 1250 ++++++++--------- .../parsing/model/TextPositionSequence.java | 63 +- .../queue/RedactionMessageReceiver.java | 15 +- .../v1/server/redaction/model/Section.java | 12 +- .../service/EntityRedactionService.java | 20 +- .../redaction/utils/EntitySearchUtils.java | 64 +- .../segmentation/PdfSegmentationService.java | 48 +- .../v1/server/RedactionIntegrationTest.java | 120 +- .../realdata/LiveDataIntegrationTest.java | 77 +- .../PdfSegmentationServiceTest.java | 69 +- 13 files changed, 839 insertions(+), 931 deletions(-) diff --git a/bamboo-specs/src/main/java/buildjob/PlanSpec.java b/bamboo-specs/src/main/java/buildjob/PlanSpec.java index f90db7ac..451c1f13 100644 --- a/bamboo-specs/src/main/java/buildjob/PlanSpec.java +++ b/bamboo-specs/src/main/java/buildjob/PlanSpec.java @@ -1,10 +1,12 @@ package buildjob; -import java.time.DayOfWeek; +import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask; + import java.time.LocalTime; import com.atlassian.bamboo.specs.api.BambooSpec; import com.atlassian.bamboo.specs.api.builders.BambooKey; +import com.atlassian.bamboo.specs.api.builders.Variable; import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration; import com.atlassian.bamboo.specs.api.builders.permission.PermissionType; import com.atlassian.bamboo.specs.api.builders.permission.Permissions; @@ -17,20 +19,15 @@ import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup; import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement; import com.atlassian.bamboo.specs.api.builders.project.Project; import com.atlassian.bamboo.specs.builders.task.CheckoutItem; -import com.atlassian.bamboo.specs.api.builders.Variable; import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask; import com.atlassian.bamboo.specs.builders.task.ScriptTask; import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask; import com.atlassian.bamboo.specs.builders.task.VcsTagTask; import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger; -import com.atlassian.bamboo.specs.builders.trigger.RepositoryPollingTrigger; import com.atlassian.bamboo.specs.builders.trigger.ScheduledTrigger; import com.atlassian.bamboo.specs.model.task.InjectVariablesScope; -import com.atlassian.bamboo.specs.util.BambooServer; -import com.atlassian.bamboo.specs.builders.task.ScriptTask; import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location; - -import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask; +import com.atlassian.bamboo.specs.util.BambooServer; /** * Plan configuration for Bamboo. @@ -45,10 +42,11 @@ public class PlanSpec { private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-", ""); + /** * Run main to publish plan on Bamboo */ - public static void main(final String[] args) throws Exception { + public static void main(final String[] args) { //By default credentials are read from the '.credentials' file. BambooServer bambooServer = new BambooServer("http://localhost:8085"); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/MockMultipartFile.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/MockMultipartFile.java index 0951ffcd..81fda12f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/MockMultipartFile.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/MockMultipartFile.java @@ -1,16 +1,16 @@ package com.iqser.red.service.redaction.v1.server.client; +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; + import org.springframework.lang.NonNull; import org.springframework.lang.Nullable; import org.springframework.util.Assert; import org.springframework.util.FileCopyUtils; import org.springframework.web.multipart.MultipartFile; -import java.io.ByteArrayInputStream; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; - public class MockMultipartFile implements MultipartFile { private final String name; @@ -82,13 +82,13 @@ public class MockMultipartFile implements MultipartFile { } - public byte[] getBytes() throws IOException { + public byte[] getBytes() { return this.content; } - public InputStream getInputStream() throws IOException { + public InputStream getInputStream() { return new ByteArrayInputStream(this.content); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/NerEntities.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/NerEntities.java index cc628f4b..466cc3aa 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/NerEntities.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/NerEntities.java @@ -5,8 +5,8 @@ import java.util.List; import java.util.Map; import com.dslplatform.json.CompiledJson; + import lombok.AllArgsConstructor; -import lombok.Builder; import lombok.Data; import lombok.NoArgsConstructor; @@ -16,6 +16,6 @@ import lombok.NoArgsConstructor; @AllArgsConstructor public class NerEntities { - private Map> data = new HashMap<>(); + private Map> data = new HashMap<>(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFTextStripper.java index aa6d531c..90408c99 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFTextStripper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFTextStripper.java @@ -54,10 +54,9 @@ import org.apache.pdfbox.util.QuickSort; /** * This is just a copy except i only adjusted lines 594-607 cause this is a bug in Pdfbox. * see S416.pdf - * */ + */ @SuppressWarnings({"PMD", "checkstyle:all"}) -public class PDFTextStripper extends LegacyPDFStreamEngine -{ +public class PDFTextStripper extends LegacyPDFStreamEngine { private static float defaultIndentThreshold = 2.0f; private static float defaultDropThreshold = 2.5f; @@ -69,70 +68,52 @@ public class PDFTextStripper extends LegacyPDFStreamEngine // with -D system properties: // pdftextstripper.indent // pdftextstripper.drop - static - { + static { String strDrop = null, strIndent = null; - try - { + try { String className = PDFTextStripper.class.getSimpleName().toLowerCase(); String prop = className + ".indent"; strIndent = System.getProperty(prop); prop = className + ".drop"; strDrop = System.getProperty(prop); - } - catch (SecurityException e) - { + } catch (SecurityException e) { // PDFBOX-1946 when run in an applet // ignore and use default } - if (strIndent != null && strIndent.length() > 0) - { - try - { + if (strIndent != null && strIndent.length() > 0) { + try { defaultIndentThreshold = Float.parseFloat(strIndent); - } - catch (NumberFormatException nfe) - { + } catch (NumberFormatException nfe) { // ignore and use default } } - if (strDrop != null && strDrop.length() > 0) - { - try - { + if (strDrop != null && strDrop.length() > 0) { + try { defaultDropThreshold = Float.parseFloat(strDrop); - } - catch (NumberFormatException nfe) - { + } catch (NumberFormatException nfe) { // ignore and use default } } } - - static - { + + + static { // check if we need to use the custom quicksort algorithm as a // workaround to the PDFBOX-1512 transitivity issue of TextPositionComparator: boolean is16orLess = false; - try - { + try { String version = System.getProperty("java.specification.version"); StringTokenizer st = new StringTokenizer(version, "."); int majorVersion = Integer.parseInt(st.nextToken()); int minorVersion = 0; - if (st.hasMoreTokens()) - { + if (st.hasMoreTokens()) { minorVersion = Integer.parseInt(st.nextToken()); } is16orLess = majorVersion == 1 && minorVersion <= 6; - } - catch (SecurityException x) - { + } catch (SecurityException x) { // when run in an applet ignore and use default // assume 1.7 or higher so that quicksort is used - } - catch (NumberFormatException nfe) - { + } catch (NumberFormatException nfe) { // should never happen, but if it does, // assume 1.7 or higher so that quicksort is used } @@ -182,18 +163,18 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * a newspaper, we want to extract the first column and then the second column. In this example the PDF would have 2 * beads(or articles), one for each column. The size of the charactersByArticle would be 5, because not all text on * the screen will fall into one of the articles. The five divisions are shown below - * + *

* Text before first article * first article text * text between first article and second article * second article text * text after second article - * + *

* Most PDFs won't have any beads, so charactersByArticle will contain a single entry. */ - protected ArrayList> charactersByArticle = new ArrayList>(); + protected ArrayList> charactersByArticle = new ArrayList<>(); - private Map>> characterListMapping = new HashMap>>(); + private Map>> characterListMapping = new HashMap<>(); protected PDDocument document; protected Writer output; @@ -203,19 +184,21 @@ public class PDFTextStripper extends LegacyPDFStreamEngine */ private boolean inParagraph; + /** * Instantiate a new PDFTextStripper object. * * @throws IOException If there is an error loading the properties. */ - public PDFTextStripper() throws IOException - { + public PDFTextStripper() throws IOException { + } + /** * This will return the text of a document. See writeText.
* NOTE: The document must not be encrypted when coming into this method. - * + * *

IMPORTANT: By default, text extraction is done in the same sequence as the text in the PDF page content stream. * PDF is a graphic format, not a text format, and unlike HTML, it has no requirements that text one on page * be rendered in a certain order. The order is the one that was determined by the software that created the @@ -225,39 +208,38 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * @return The text of the PDF document. * @throws IOException if the doc state is invalid or it is encrypted. */ - public String getText(PDDocument doc) throws IOException - { + public String getText(PDDocument doc) throws IOException { + StringWriter outputStream = new StringWriter(); writeText(doc, outputStream); return outputStream.toString(); } - private void resetEngine() - { + + private void resetEngine() { + currentPageNo = 0; document = null; - if (charactersByArticle != null) - { + if (charactersByArticle != null) { charactersByArticle.clear(); } characterListMapping.clear(); } + /** * This will take a PDDocument and write the text of that document to the print writer. * - * @param doc The document to get the data from. + * @param doc The document to get the data from. * @param outputStream The location to put the text. - * * @throws IOException If the doc is in an invalid state. */ - public void writeText(PDDocument doc, Writer outputStream) throws IOException - { + public void writeText(PDDocument doc, Writer outputStream) throws IOException { + resetEngine(); document = doc; output = outputStream; - if (getAddMoreFormatting()) - { + if (getAddMoreFormatting()) { paragraphEnd = lineSeparator; pageStart = lineSeparator; articleStart = lineSeparator; @@ -268,43 +250,32 @@ public class PDFTextStripper extends LegacyPDFStreamEngine endDocument(document); } + /** * This will process all of the pages and the text that is in them. * * @param pages The pages object in the document. - * * @throws IOException If there is an error parsing the text. */ - protected void processPages(PDPageTree pages) throws IOException - { - PDPage startBookmarkPage = startBookmark == null ? null - : startBookmark.findDestinationPage(document); - if (startBookmarkPage != null) - { + protected void processPages(PDPageTree pages) throws IOException { + + PDPage startBookmarkPage = startBookmark == null ? null : startBookmark.findDestinationPage(document); + if (startBookmarkPage != null) { startBookmarkPageNumber = pages.indexOf(startBookmarkPage) + 1; - } - else - { + } else { // -1 = undefined startBookmarkPageNumber = -1; } - PDPage endBookmarkPage = endBookmark == null ? null - : endBookmark.findDestinationPage(document); - if (endBookmarkPage != null) - { + PDPage endBookmarkPage = endBookmark == null ? null : endBookmark.findDestinationPage(document); + if (endBookmarkPage != null) { endBookmarkPageNumber = pages.indexOf(endBookmarkPage) + 1; - } - else - { + } else { // -1 = undefined endBookmarkPageNumber = -1; } - if (startBookmarkPageNumber == -1 && startBookmark != null && endBookmarkPageNumber == -1 - && endBookmark != null - && startBookmark.getCOSObject() == endBookmark.getCOSObject()) - { + if (startBookmarkPageNumber == -1 && startBookmark != null && endBookmarkPageNumber == -1 && endBookmark != null && startBookmark.getCOSObject() == endBookmark.getCOSObject()) { // this is a special case where both the start and end bookmark // are the same but point to nothing. In this case // we will not extract any text. @@ -312,102 +283,87 @@ public class PDFTextStripper extends LegacyPDFStreamEngine endBookmarkPageNumber = 0; } - for (PDPage page : pages) - { + for (PDPage page : pages) { currentPageNo++; - if (page.hasContents()) - { + if (page.hasContents()) { processPage(page); } } } + /** * This method is available for subclasses of this class. It will be called before processing of the document start. * * @param document The PDF document that is being processed. - * @throws IOException If an IO error occurs. */ - protected void startDocument(PDDocument document) throws IOException - { + protected void startDocument(PDDocument document) { // no default implementation, but available for subclasses } + /** * This method is available for subclasses of this class. It will be called after processing of the document * finishes. * * @param document The PDF document that is being processed. - * @throws IOException If an IO error occurs. */ - protected void endDocument(PDDocument document) throws IOException - { + protected void endDocument(PDDocument document) { // no default implementation, but available for subclasses } + /** * This will process the contents of a page. * * @param page The page to process. - * * @throws IOException If there is an error processing the page. */ @Override - public void processPage(PDPage page) throws IOException - { - if (currentPageNo >= startPage && currentPageNo <= endPage - && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) - && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) - { + public void processPage(PDPage page) throws IOException { + + if (currentPageNo >= startPage && currentPageNo <= endPage && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber) && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) { startPage(page); int numberOfArticleSections = 1; - if (shouldSeparateByBeads) - { + if (shouldSeparateByBeads) { fillBeadRectangles(page); numberOfArticleSections += beadRectangles.size() * 2; } int originalSize = charactersByArticle.size(); charactersByArticle.ensureCapacity(numberOfArticleSections); int lastIndex = Math.max(numberOfArticleSections, originalSize); - for (int i = 0; i < lastIndex; i++) - { - if (i < originalSize) - { + for (int i = 0; i < lastIndex; i++) { + if (i < originalSize) { charactersByArticle.get(i).clear(); - } - else - { - if (numberOfArticleSections < originalSize) - { + } else { + if (numberOfArticleSections < originalSize) { charactersByArticle.remove(i); - } - else - { - charactersByArticle.add(new ArrayList()); + } else { + charactersByArticle.add(new ArrayList<>()); } } } characterListMapping.clear(); - super.processPage(page);writePage(); + super.processPage(page); + writePage(); endPage(page); } } - private void fillBeadRectangles(PDPage page) - { - beadRectangles = new ArrayList(); - for (PDThreadBead bead : page.getThreadBeads()) - { - if (bead == null || bead.getRectangle() == null) - { + + private void fillBeadRectangles(PDPage page) { + + beadRectangles = new ArrayList<>(); + for (PDThreadBead bead : page.getThreadBeads()) { + if (bead == null || bead.getRectangle() == null) { // can't skip, because of null entry handling in processTextPosition() beadRectangles.add(null); continue; } - + PDRectangle rect = bead.getRectangle(); - + // bead rectangle is in PDF coordinates (y=0 is bottom), // glyphs are in image coordinates (y=0 is top), // so we must flip @@ -416,21 +372,21 @@ public class PDFTextStripper extends LegacyPDFStreamEngine float lowerLeftY = mediaBox.getUpperRightY() - rect.getUpperRightY(); rect.setLowerLeftY(lowerLeftY); rect.setUpperRightY(upperRightY); - + // adjust for cropbox PDRectangle cropBox = page.getCropBox(); - if (cropBox.getLowerLeftX() != 0 || cropBox.getLowerLeftY() != 0) - { + if (cropBox.getLowerLeftX() != 0 || cropBox.getLowerLeftY() != 0) { rect.setLowerLeftX(rect.getLowerLeftX() - cropBox.getLowerLeftX()); rect.setLowerLeftY(rect.getLowerLeftY() - cropBox.getLowerLeftY()); rect.setUpperRightX(rect.getUpperRightX() - cropBox.getLowerLeftX()); rect.setUpperRightY(rect.getUpperRightY() - cropBox.getLowerLeftY()); } - + beadRectangles.add(rect); } } + /** * Start a new article, which is typically defined as a column on a single page (also referred to as a bead). This * assumes that the primary direction of text is left to right. Default implementation is to do nothing. Subclasses @@ -438,11 +394,12 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * * @throws IOException If there is any error writing to the stream. */ - protected void startArticle() throws IOException - { + protected void startArticle() throws IOException { + startArticle(true); } + /** * Start a new article, which is typically defined as a column on a single page (also referred to as a bead). * Default implementation is to do nothing. Subclasses may provide additional information. @@ -450,45 +407,43 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * @param isLTR true if primary direction of text is left to right. * @throws IOException If there is any error writing to the stream. */ - protected void startArticle(boolean isLTR) throws IOException - { + protected void startArticle(boolean isLTR) throws IOException { + output.write(getArticleStart()); } + /** * End an article. Default implementation is to do nothing. Subclasses may provide additional information. * * @throws IOException If there is any error writing to the stream. */ - protected void endArticle() throws IOException - { + protected void endArticle() throws IOException { + output.write(getArticleEnd()); } + /** * Start a new page. Default implementation is to do nothing. Subclasses may provide additional information. * * @param page The page we are about to process. - * - * @throws IOException If there is any error writing to the stream. */ - protected void startPage(PDPage page) throws IOException - { + protected void startPage(PDPage page) { // default is to do nothing } + /** * End a page. Default implementation is to do nothing. Subclasses may provide additional information. * * @param page The page we are about to process. - * - * @throws IOException If there is any error writing to the stream. */ - protected void endPage(PDPage page) throws IOException - { + protected void endPage(PDPage page) { // default is to do nothing } + private static final float END_OF_LAST_TEXT_X_RESET_VALUE = -1; private static final float MAX_Y_FOR_LINE_RESET_VALUE = -Float.MAX_VALUE; private static final float EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE = -Float.MAX_VALUE; @@ -496,6 +451,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine private static final float MIN_Y_TOP_FOR_LINE_RESET_VALUE = Float.MAX_VALUE; private static final float LAST_WORD_SPACING_RESET_VALUE = -1; + /** * This will print the text of the processed page to "output". It will estimate, based on the coordinates of the * text, where newlines and word spacings should be placed. The text will be sorted only if that feature was @@ -503,8 +459,8 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * * @throws IOException If there is an error writing the text. */ - protected void writePage() throws IOException - { + protected void writePage() throws IOException { + float maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE; float minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE; float endOfLastTextX = END_OF_LAST_TEXT_X_RESET_VALUE; @@ -515,26 +471,20 @@ public class PDFTextStripper extends LegacyPDFStreamEngine boolean startOfPage = true; // flag to indicate start of page boolean startOfArticle; - if (charactersByArticle.size() > 0) - { + if (!charactersByArticle.isEmpty()) { writePageStart(); } - for (List textList : charactersByArticle) - { - if (getSortByPosition()) - { + for (List textList : charactersByArticle) { + if (getSortByPosition()) { TextPositionComparator comparator = new TextPositionComparator(); // because the TextPositionComparator is not transitive, but // JDK7+ enforces transitivity on comparators, we need to use // a custom quicksort implementation (which is slower, unfortunately). - if (useCustomQuickSort) - { + if (useCustomQuickSort) { QuickSort.sort(textList, comparator); - } - else - { + } else { Collections.sort(textList, comparator); } } @@ -545,7 +495,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine // Now cycle through to print the text. // We queue up a line at a time before we print so that we can convert // the line from presentation form to logical form (if needed). - List line = new ArrayList(); + List line = new ArrayList<>(); Iterator textIter = textList.iterator(); // PDF files don't always store spaces. We will need to guess where we should add @@ -558,18 +508,15 @@ public class PDFTextStripper extends LegacyPDFStreamEngine // Keeps track of the previous average character width float previousAveCharWidth = -1; - while (textIter.hasNext()) - { + while (textIter.hasNext()) { TextPosition position = textIter.next(); PositionWrapper current = new PositionWrapper(position); String characterValue = position.getUnicode(); // Resets the average character width when we see a change in font // or a change in the font size - if (lastPosition != null && (position.getFont() != lastPosition.getTextPosition() - .getFont() - || position.getFontSize() != lastPosition.getTextPosition().getFontSize())) - { + if (lastPosition != null && (position.getFont() != lastPosition.getTextPosition().getFont() || position.getFontSize() != lastPosition.getTextPosition() + .getFontSize())) { previousAveCharWidth = -1; } @@ -578,15 +525,14 @@ public class PDFTextStripper extends LegacyPDFStreamEngine float positionWidth; float positionHeight; - // If we are sorting, then we need to use the text direction // adjusted coordinates, because they were used in the sorting. // if (getSortByPosition()) // { - positionX = position.getXDirAdj(); - positionY = position.getYDirAdj(); - positionWidth = position.getWidthDirAdj(); - positionHeight = position.getHeightDir(); + positionX = position.getXDirAdj(); + positionY = position.getYDirAdj(); + positionWidth = position.getWidthDirAdj(); + positionHeight = position.getHeightDir(); // } // else // { @@ -603,18 +549,12 @@ public class PDFTextStripper extends LegacyPDFStreamEngine // space character with some margin. float wordSpacing = position.getWidthOfSpace(); float deltaSpace; - if (wordSpacing == 0 || Float.isNaN(wordSpacing)) - { + if (wordSpacing == 0 || Float.isNaN(wordSpacing)) { deltaSpace = Float.MAX_VALUE; - } - else - { - if (lastWordSpacing < 0) - { + } else { + if (lastWordSpacing < 0) { deltaSpace = wordSpacing * getSpacingTolerance(); - } - else - { + } else { deltaSpace = (wordSpacing + lastWordSpacing) / 2f * getSpacingTolerance(); } } @@ -624,12 +564,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine // averages) but we found that it gave the best results after numerous experiments. // Based on experiments we also found that .3 worked well. float averageCharWidth; - if (previousAveCharWidth < 0) - { + if (previousAveCharWidth < 0) { averageCharWidth = positionWidth / wordCharCount; - } - else - { + } else { averageCharWidth = (previousAveCharWidth + positionWidth / wordCharCount) / 2f; } float deltaCharWidth = averageCharWidth * getAverageCharTolerance(); @@ -637,15 +574,12 @@ public class PDFTextStripper extends LegacyPDFStreamEngine // Compares the values obtained by the average method and the wordSpacing method // and picks the smaller number. float expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE; - if (endOfLastTextX != END_OF_LAST_TEXT_X_RESET_VALUE) - { + if (endOfLastTextX != END_OF_LAST_TEXT_X_RESET_VALUE) { expectedStartOfNextWordX = endOfLastTextX + Math.min(deltaSpace, deltaCharWidth); } - if (lastPosition != null) - { - if (startOfArticle) - { + if (lastPosition != null) { + if (startOfArticle) { lastPosition.setArticleStart(); startOfArticle = false; } @@ -659,41 +593,32 @@ public class PDFTextStripper extends LegacyPDFStreamEngine // full range seen in this line. This is what I tried to do with minYTopForLine, // but this caused a lot of regression test failures. So, I'm leaving it be for // now - if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) - { + if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) { writeLine(normalize(line)); line.clear(); - lastLineStartPosition = handleLineSeparation(current, lastPosition, - lastLineStartPosition, maxHeightForLine); + lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine); expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE; maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE; maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE; minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE; } // test if our TextPosition starts after a new word would be expected to start - if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE - && expectedStartOfNextWordX < positionX + if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE && expectedStartOfNextWordX < positionX // only bother adding a word separator if the last character was not a word separator && (wordSeparator.isEmpty() || // - (lastPosition.getTextPosition().getUnicode() != null - && !lastPosition.getTextPosition().getUnicode() - .endsWith(wordSeparator)))) - { + (lastPosition.getTextPosition().getUnicode() != null && !lastPosition.getTextPosition().getUnicode().endsWith(wordSeparator)))) { line.add(LineItem.getWordSeparator()); } // if there is at least the equivalent of one space // between the last character and the current one, // reset the max line height as the font size may have completely changed - if (Math.abs(position.getX() - - lastPosition.getTextPosition().getX()) > (wordSpacing + deltaSpace)) - { + if (Math.abs(position.getX() - lastPosition.getTextPosition().getX()) > (wordSpacing + deltaSpace)) { maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE; maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE; minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE; } } - if (positionY >= maxYForLine) - { + if (positionY >= maxYForLine) { maxYForLine = positionY; } // RDD - endX is what PDF considers to be the x coordinate of the @@ -701,10 +626,8 @@ public class PDFTextStripper extends LegacyPDFStreamEngine endOfLastTextX = positionX + positionWidth; // add it to the list - if (characterValue != null) - { - if (startOfPage && lastPosition == null) - { + if (characterValue != null) { + if (startOfPage && lastPosition == null) { writeParagraphStart();// not sure this is correct for RTL? } line.add(new LineItem(position)); @@ -712,8 +635,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine maxHeightForLine = Math.max(maxHeightForLine, positionHeight); minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight); lastPosition = current; - if (startOfPage) - { + if (startOfPage) { lastPosition.setParagraphStart(); lastPosition.setLineStart(); lastLineStartPosition = lastPosition; @@ -723,8 +645,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine previousAveCharWidth = averageCharWidth; } // print the final line - if (line.size() > 0) - { + if (line.size() > 0) { writeLine(normalize(line)); writeParagraphEnd(); } @@ -733,79 +654,86 @@ public class PDFTextStripper extends LegacyPDFStreamEngine writePageEnd(); } - private boolean overlap(float y1, float height1, float y2, float height2) - { - return within(y1, y2, .1f) || y2 <= y1 && y2 >= y1 - height1 - || y1 <= y2 && y1 >= y2 - height2; + + private boolean overlap(float y1, float height1, float y2, float height2) { + + return within(y1, y2, .1f) || y2 <= y1 && y2 >= y1 - height1 || y1 <= y2 && y1 >= y2 - height2; } + /** * Write the line separator value to the output stream. - * + * * @throws IOException If there is a problem writing out the line separator to the document. */ - protected void writeLineSeparator() throws IOException - { + protected void writeLineSeparator() throws IOException { + output.write(getLineSeparator()); } + /** * Write the word separator value to the output stream. - * + * * @throws IOException If there is a problem writing out the word separator to the document. */ - protected void writeWordSeparator() throws IOException - { + protected void writeWordSeparator() throws IOException { + output.write(getWordSeparator()); } + /** * Write the string in TextPosition to the output stream. * * @param text The text to write to the stream. * @throws IOException If there is an error when writing the text. */ - protected void writeCharacters(TextPosition text) throws IOException - { + protected void writeCharacters(TextPosition text) throws IOException { + output.write(text.getUnicode()); } + /** * Write a Java string to the output stream. The default implementation will ignore the textPositions * and just calls {@link #writeString(String)}. * - * @param text The text to write to the stream. + * @param text The text to write to the stream. * @param textPositions The TextPositions belonging to the text. * @throws IOException If there is an error when writing the text. */ - protected void writeString(String text, List textPositions) throws IOException - { + protected void writeString(String text, List textPositions) throws IOException { + writeString(text); } + /** * Write a Java string to the output stream. * * @param text The text to write to the stream. * @throws IOException If there is an error when writing the text. */ - protected void writeString(String text) throws IOException - { + protected void writeString(String text) throws IOException { + output.write(text); } + /** * This will determine of two floating point numbers are within a specified variance. * - * @param first The first number to compare to. - * @param second The second number to compare to. + * @param first The first number to compare to. + * @param second The second number to compare to. * @param variance The allowed variance. */ - private boolean within(float first, float second, float variance) - { + private boolean within(float first, float second, float variance) { + return second < first + variance && second > first - variance; } + /** * This will process a TextPosition object and add the text to the list of characters on a page. It takes care of * overlapping text. @@ -813,19 +741,16 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * @param text The text to process. */ @Override - protected void processTextPosition(TextPosition text) - { + protected void processTextPosition(TextPosition text) { + boolean showCharacter = true; - if (suppressDuplicateOverlappingText) - { + if (suppressDuplicateOverlappingText) { showCharacter = false; String textCharacter = text.getUnicode(); float textX = text.getX(); float textY = text.getY(); - TreeMap> sameTextCharacters = characterListMapping - .get(textCharacter); - if (sameTextCharacters == null) - { + TreeMap> sameTextCharacters = characterListMapping.get(textCharacter); + if (sameTextCharacters == null) { sameTextCharacters = new TreeMap>(); characterListMapping.put(textCharacter, sameTextCharacters); } @@ -842,22 +767,17 @@ public class PDFTextStripper extends LegacyPDFStreamEngine boolean suppressCharacter = false; float tolerance = text.getWidth() / textCharacter.length() / 3.0f; - SortedMap> xMatches = sameTextCharacters.subMap(textX - tolerance, - textX + tolerance); - for (TreeSet xMatch : xMatches.values()) - { + SortedMap> xMatches = sameTextCharacters.subMap(textX - tolerance, textX + tolerance); + for (TreeSet xMatch : xMatches.values()) { SortedSet yMatches = xMatch.subSet(textY - tolerance, textY + tolerance); - if (!yMatches.isEmpty()) - { + if (!yMatches.isEmpty()) { suppressCharacter = true; break; } } - if (!suppressCharacter) - { + if (!suppressCharacter) { TreeSet ySet = sameTextCharacters.get(textX); - if (ySet == null) - { + if (ySet == null) { ySet = new TreeSet(); sameTextCharacters.put(textX, ySet); } @@ -865,8 +785,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine showCharacter = true; } } - if (showCharacter) - { + if (showCharacter) { // if we are showing the character then we need to determine which article it belongs to int foundArticleDivisionIndex = -1; int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1; @@ -874,62 +793,36 @@ public class PDFTextStripper extends LegacyPDFStreamEngine int notFoundButFirstAboveArticleDivisionIndex = -1; float x = text.getX(); float y = text.getY(); - if (shouldSeparateByBeads) - { - for (int i = 0; i < beadRectangles.size() && foundArticleDivisionIndex == -1; i++) - { + if (shouldSeparateByBeads) { + for (int i = 0; i < beadRectangles.size() && foundArticleDivisionIndex == -1; i++) { PDRectangle rect = beadRectangles.get(i); - if (rect != null) - { - if (rect.contains(x, y)) - { + if (rect != null) { + if (rect.contains(x, y)) { foundArticleDivisionIndex = i * 2 + 1; - } - else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY()) - && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) - { + } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY()) && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) { notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2; - } - else if (x < rect.getLowerLeftX() - && notFoundButFirstLeftArticleDivisionIndex == -1) - { + } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) { notFoundButFirstLeftArticleDivisionIndex = i * 2; - } - else if (y < rect.getUpperRightY() - && notFoundButFirstAboveArticleDivisionIndex == -1) - { + } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) { notFoundButFirstAboveArticleDivisionIndex = i * 2; } - } - else - { + } else { foundArticleDivisionIndex = 0; } } - } - else - { + } else { foundArticleDivisionIndex = 0; } int articleDivisionIndex; - if (foundArticleDivisionIndex != -1) - { + if (foundArticleDivisionIndex != -1) { articleDivisionIndex = foundArticleDivisionIndex; - } - else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) - { + } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex; - } - else if (notFoundButFirstLeftArticleDivisionIndex != -1) - { + } else if (notFoundButFirstLeftArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex; - } - else if (notFoundButFirstAboveArticleDivisionIndex != -1) - { + } else if (notFoundButFirstAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex; - } - else - { + } else { articleDivisionIndex = charactersByArticle.size() - 1; } @@ -940,37 +833,31 @@ public class PDFTextStripper extends LegacyPDFStreamEngine // graphically, the two chunks get overlaid. With text output though, // we need to do the overlay. This code recombines the diacritic with // its associated character if the two are consecutive. - if (textList.isEmpty()) - { + if (textList.isEmpty()) { textList.add(text); - } - else - { + } else { // test if we overlap the previous entry. // Note that we are making an assumption that we need to only look back // one TextPosition to find what we are overlapping. // This may not always be true. */ TextPosition previousTextPosition = textList.get(textList.size() - 1); - if (text.isDiacritic() && previousTextPosition.contains(text)) - { + if (text.isDiacritic() && previousTextPosition.contains(text)) { previousTextPosition.mergeDiacritic(text); } // If the previous TextPosition was the diacritic, merge it into this // one and remove it from the list. - else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) - { + else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) { text.mergeDiacritic(previousTextPosition); textList.remove(textList.size() - 1); textList.add(text); - } - else - { + } else { textList.add(text); } } } } + /** * This is the page that the text extraction will start on. The pages start at page 1. For example in a 5 page PDF * document, if the start page is 1 then all pages will be extracted. If the start page is 4 then pages 4 and 5 will @@ -978,21 +865,23 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * * @return Value of property startPage. */ - public int getStartPage() - { + public int getStartPage() { + return startPage; } + /** * This will set the first page to be extracted by this class. * * @param startPageValue New value of 1-based startPage property. */ - public void setStartPage(int startPageValue) - { + public void setStartPage(int startPageValue) { + startPage = startPageValue; } + /** * This will get the last page that will be extracted. This is inclusive, for example if a 5 page PDF an endPage * value of 5 would extract the entire document, an end page of 2 would extract pages 1 and 2. This defaults to @@ -1000,52 +889,57 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * * @return Value of property endPage. */ - public int getEndPage() - { + public int getEndPage() { + return endPage; } + /** * This will set the last page to be extracted by this class. * * @param endPageValue New value of 1-based endPage property. */ - public void setEndPage(int endPageValue) - { + public void setEndPage(int endPageValue) { + endPage = endPageValue; } + /** * Set the desired line separator for output text. The line.separator system property is used if the line separator * preference is not set explicitly using this method. * * @param separator The desired line separator string. */ - public void setLineSeparator(String separator) - { + public void setLineSeparator(String separator) { + lineSeparator = separator; } + /** * This will get the line separator. * * @return The desired line separator string. */ - public String getLineSeparator() - { + public String getLineSeparator() { + return lineSeparator; } + /** * This will get the word separator. * * @return The desired word separator string. */ - public String getWordSeparator() - { + public String getWordSeparator() { + return wordSeparator; } + /** * Set the desired word separator for output text. The PDFBox text extraction algorithm will output a space * character if there is enough space between two words. By default a space character is used. If you need and @@ -1054,50 +948,55 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * * @param separator The desired page separator string. */ - public void setWordSeparator(String separator) - { + public void setWordSeparator(String separator) { + wordSeparator = separator; } + /** * @return Returns the suppressDuplicateOverlappingText. */ - public boolean getSuppressDuplicateOverlappingText() - { + public boolean getSuppressDuplicateOverlappingText() { + return suppressDuplicateOverlappingText; } + /** * Get the current page number that is being processed. * * @return A 1 based number representing the current page. */ - protected int getCurrentPageNo() - { + protected int getCurrentPageNo() { + return currentPageNo; } + /** * The output stream that is being written to. * * @return The stream that output is being written to. */ - protected Writer getOutput() - { + protected Writer getOutput() { + return output; } + /** * Character strings are grouped by articles. It is quite common that there will only be a single article. This * returns a List that contains List objects, the inner lists will contain TextPosition objects. * * @return A double List of TextPositions for all text strings on the page. */ - protected List> getCharactersByArticle() - { + protected List> getCharactersByArticle() { + return charactersByArticle; } + /** * By default the text stripper will attempt to remove text that overlapps each other. Word paints the same * character several times in order to make it look bold. By setting this to false all text will be extracted, which @@ -1105,101 +1004,111 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set. */ - public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingTextValue) - { + public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingTextValue) { + suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue; } + /** * This will tell if the text stripper should separate by beads. * * @return If the text will be grouped by beads. */ - public boolean getSeparateByBeads() - { + public boolean getSeparateByBeads() { + return shouldSeparateByBeads; } + /** * Set if the text stripper should group the text output by a list of beads. The default value is true! * * @param aShouldSeparateByBeads The new grouping of beads. */ - public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) - { + public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) { + shouldSeparateByBeads = aShouldSeparateByBeads; } + /** * Get the bookmark where text extraction should end, inclusive. Default is null. * * @return The ending bookmark. */ - public PDOutlineItem getEndBookmark() - { + public PDOutlineItem getEndBookmark() { + return endBookmark; } + /** * Set the bookmark where the text extraction should stop. * * @param aEndBookmark The ending bookmark. */ - public void setEndBookmark(PDOutlineItem aEndBookmark) - { + public void setEndBookmark(PDOutlineItem aEndBookmark) { + endBookmark = aEndBookmark; } + /** * Get the bookmark where text extraction should start, inclusive. Default is null. * * @return The starting bookmark. */ - public PDOutlineItem getStartBookmark() - { + public PDOutlineItem getStartBookmark() { + return startBookmark; } + /** * Set the bookmark where text extraction should start, inclusive. * * @param aStartBookmark The starting bookmark. */ - public void setStartBookmark(PDOutlineItem aStartBookmark) - { + public void setStartBookmark(PDOutlineItem aStartBookmark) { + startBookmark = aStartBookmark; } + /** * This will tell if the text stripper should add some more text formatting. - * + * * @return true if some more text formatting will be added */ - public boolean getAddMoreFormatting() - { + public boolean getAddMoreFormatting() { + return addMoreFormatting; } + /** * There will some additional text formatting be added if addMoreFormatting is set to true. Default is false. - * + * * @param newAddMoreFormatting Tell PDFBox to add some more text formatting */ - public void setAddMoreFormatting(boolean newAddMoreFormatting) - { + public void setAddMoreFormatting(boolean newAddMoreFormatting) { + addMoreFormatting = newAddMoreFormatting; } + /** * This will tell if the text stripper should sort the text tokens before writing to the stream. * * @return true If the text tokens will be sorted before being written. */ - public boolean getSortByPosition() - { + public boolean getSortByPosition() { + return sortByPosition; } + /** * The order of the text tokens in a PDF file may not be in the same as they appear visually on the screen. For * example, a PDF writer may write out all text by font, so all bold or larger text, then make a second pass and @@ -1211,68 +1120,74 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * * @param newSortByPosition Tell PDFBox to sort the text positions. */ - public void setSortByPosition(boolean newSortByPosition) - { + public void setSortByPosition(boolean newSortByPosition) { + sortByPosition = newSortByPosition; } + /** * Get the current space width-based tolerance value that is being used to estimate where spaces in text should be * added. Note that the default value for this has been determined from trial and error. - * + * * @return The current tolerance / scaling factor */ - public float getSpacingTolerance() - { + public float getSpacingTolerance() { + return spacingTolerance; } + /** * Set the space width-based tolerance value that is used to estimate where spaces in text should be added. Note * that the default value for this has been determined from trial and error. Setting this value larger will reduce * the number of spaces added. - * + * * @param spacingToleranceValue tolerance / scaling factor to use */ - public void setSpacingTolerance(float spacingToleranceValue) - { + public void setSpacingTolerance(float spacingToleranceValue) { + spacingTolerance = spacingToleranceValue; } + /** * Get the current character width-based tolerance value that is being used to estimate where spaces in text should * be added. Note that the default value for this has been determined from trial and error. - * + * * @return The current tolerance / scaling factor */ - public float getAverageCharTolerance() - { + public float getAverageCharTolerance() { + return averageCharTolerance; } + /** * Set the character width-based tolerance value that is used to estimate where spaces in text should be added. Note * that the default value for this has been determined from trial and error. Setting this value larger will reduce * the number of spaces added. - * + * * @param averageCharToleranceValue average tolerance / scaling factor to use */ - public void setAverageCharTolerance(float averageCharToleranceValue) - { + public void setAverageCharTolerance(float averageCharToleranceValue) { + averageCharTolerance = averageCharToleranceValue; } + /** * returns the multiple of whitespace character widths for the current text which the current line start can be * indented from the previous line start beyond which the current line start is considered to be a paragraph start. - * + * * @return the number of whitespace character widths to use when detecting paragraph indents. */ - public float getIndentThreshold() - { + public float getIndentThreshold() { + return indentThreshold; } + /** * sets the multiple of whitespace character widths for the current text which the current line start can be * indented from the previous line start beyond which the current line start is considered to be a paragraph start. @@ -1280,194 +1195,202 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * * @param indentThresholdValue the number of whitespace character widths to use when detecting paragraph indents. */ - public void setIndentThreshold(float indentThresholdValue) - { + public void setIndentThreshold(float indentThresholdValue) { + indentThreshold = indentThresholdValue; } + /** * the minimum whitespace, as a multiple of the max height of the current characters beyond which the current line * start is considered to be a paragraph start. - * + * * @return the character height multiple for max allowed whitespace between lines in the same paragraph. */ - public float getDropThreshold() - { + public float getDropThreshold() { + return dropThreshold; } + /** * sets the minimum whitespace, as a multiple of the max height of the current characters beyond which the current * line start is considered to be a paragraph start. The default value is 2.5. * * @param dropThresholdValue the character height multiple for max allowed whitespace between lines in the same - * paragraph. + * paragraph. */ - public void setDropThreshold(float dropThresholdValue) - { + public void setDropThreshold(float dropThresholdValue) { + dropThreshold = dropThresholdValue; } + /** * Returns the string which will be used at the beginning of a paragraph. - * + * * @return the paragraph start string */ - public String getParagraphStart() - { + public String getParagraphStart() { + return paragraphStart; } + /** * Sets the string which will be used at the beginning of a paragraph. - * + * * @param s the paragraph start string */ - public void setParagraphStart(String s) - { + public void setParagraphStart(String s) { + paragraphStart = s; } + /** * Returns the string which will be used at the end of a paragraph. - * + * * @return the paragraph end string */ - public String getParagraphEnd() - { + public String getParagraphEnd() { + return paragraphEnd; } + /** * Sets the string which will be used at the end of a paragraph. - * + * * @param s the paragraph end string */ - public void setParagraphEnd(String s) - { + public void setParagraphEnd(String s) { + paragraphEnd = s; } + /** * Returns the string which will be used at the beginning of a page. - * + * * @return the page start string */ - public String getPageStart() - { + public String getPageStart() { + return pageStart; } + /** * Sets the string which will be used at the beginning of a page. - * + * * @param pageStartValue the page start string */ - public void setPageStart(String pageStartValue) - { + public void setPageStart(String pageStartValue) { + pageStart = pageStartValue; } + /** * Returns the string which will be used at the end of a page. - * + * * @return the page end string */ - public String getPageEnd() - { + public String getPageEnd() { + return pageEnd; } + /** * Sets the string which will be used at the end of a page. - * + * * @param pageEndValue the page end string */ - public void setPageEnd(String pageEndValue) - { + public void setPageEnd(String pageEndValue) { + pageEnd = pageEndValue; } + /** * Returns the string which will be used at the beginning of an article. - * + * * @return the article start string */ - public String getArticleStart() - { + public String getArticleStart() { + return articleStart; } + /** * Sets the string which will be used at the beginning of an article. - * + * * @param articleStartValue the article start string */ - public void setArticleStart(String articleStartValue) - { + public void setArticleStart(String articleStartValue) { + articleStart = articleStartValue; } + /** * Returns the string which will be used at the end of an article. - * + * * @return the article end string */ - public String getArticleEnd() - { + public String getArticleEnd() { + return articleEnd; } + /** * Sets the string which will be used at the end of an article. - * + * * @param articleEndValue the article end string */ - public void setArticleEnd(String articleEndValue) - { + public void setArticleEnd(String articleEndValue) { + articleEnd = articleEndValue; } + /** * handles the line separator for a new line given the specified current and previous TextPositions. - * - * @param current the current text position - * @param lastPosition the previous text position + * + * @param current the current text position + * @param lastPosition the previous text position * @param lastLineStartPosition the last text position that followed a line separator. - * @param maxHeightForLine max height for positions since lastLineStartPosition + * @param maxHeightForLine max height for positions since lastLineStartPosition * @return start position of the last line * @throws IOException if something went wrong */ - private PositionWrapper handleLineSeparation(PositionWrapper current, - PositionWrapper lastPosition, PositionWrapper lastLineStartPosition, - float maxHeightForLine) throws IOException - { + private PositionWrapper handleLineSeparation(PositionWrapper current, PositionWrapper lastPosition, PositionWrapper lastLineStartPosition, + float maxHeightForLine) throws IOException { + current.setLineStart(); isParagraphSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine); lastLineStartPosition = current; - if (current.isParagraphStart()) - { - if (lastPosition.isArticleStart()) - { - if (lastPosition.isLineStart()) - { + if (current.isParagraphStart()) { + if (lastPosition.isArticleStart()) { + if (lastPosition.isLineStart()) { writeLineSeparator(); } writeParagraphStart(); - } - else - { + } else { writeLineSeparator(); writeParagraphSeparator(); } - } - else - { + } else { writeLineSeparator(); } return lastLineStartPosition; } + /** * tests the relationship between the last text position, the current text position and the last text position that * followed a line separator to decide if the gap represents a paragraph separation. This should only be @@ -1484,113 +1407,90 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * This method sets the isParagraphStart and isHangingIndent flags on the current position object. *

* - * @param position the current text position. This may have its isParagraphStart or isHangingIndent flags set upon - * return. - * @param lastPosition the previous text position (should not be null). + * @param position the current text position. This may have its isParagraphStart or isHangingIndent flags set upon + * return. + * @param lastPosition the previous text position (should not be null). * @param lastLineStartPosition the last text position that followed a line separator, or null. - * @param maxHeightForLine max height for text positions since lasLineStartPosition. + * @param maxHeightForLine max height for text positions since lasLineStartPosition. */ - private void isParagraphSeparation(PositionWrapper position, PositionWrapper lastPosition, - PositionWrapper lastLineStartPosition, float maxHeightForLine) - { + private void isParagraphSeparation(PositionWrapper position, PositionWrapper lastPosition, PositionWrapper lastLineStartPosition, float maxHeightForLine) { + boolean result = false; - if (lastLineStartPosition == null) - { + if (lastLineStartPosition == null) { result = true; - } - else - { - float yGap = Math.abs(position.getTextPosition().getYDirAdj() - - lastPosition.getTextPosition().getYDirAdj()); + } else { + float yGap = Math.abs(position.getTextPosition().getYDirAdj() - lastPosition.getTextPosition().getYDirAdj()); float newYVal = multiplyFloat(getDropThreshold(), maxHeightForLine); // do we need to flip this for rtl? - float xGap = position.getTextPosition().getXDirAdj() - - lastLineStartPosition.getTextPosition().getXDirAdj(); - float newXVal = multiplyFloat(getIndentThreshold(), - position.getTextPosition().getWidthOfSpace()); + float xGap = position.getTextPosition().getXDirAdj() - lastLineStartPosition.getTextPosition().getXDirAdj(); + float newXVal = multiplyFloat(getIndentThreshold(), position.getTextPosition().getWidthOfSpace()); float positionWidth = multiplyFloat(0.25f, position.getTextPosition().getWidth()); - if (yGap > newYVal) - { + if (yGap > newYVal) { result = true; - } - else if (xGap > newXVal) - { + } else if (xGap > newXVal) { // text is indented, but try to screen for hanging indent - if (!lastLineStartPosition.isParagraphStart()) - { + if (!lastLineStartPosition.isParagraphStart()) { result = true; - } - else - { + } else { position.setHangingIndent(); } - } - else if (xGap < -position.getTextPosition().getWidthOfSpace()) - { + } else if (xGap < -position.getTextPosition().getWidthOfSpace()) { // text is left of previous line. Was it a hanging indent? - if (!lastLineStartPosition.isParagraphStart()) - { + if (!lastLineStartPosition.isParagraphStart()) { result = true; } - } - else if (Math.abs(xGap) < positionWidth) - { + } else if (Math.abs(xGap) < positionWidth) { // current horizontal position is within 1/4 a char of the last // linestart. We'll treat them as lined up. - if (lastLineStartPosition.isHangingIndent()) - { + if (lastLineStartPosition.isHangingIndent()) { position.setHangingIndent(); - } - else if (lastLineStartPosition.isParagraphStart()) - { + } else if (lastLineStartPosition.isParagraphStart()) { // check to see if the previous line looks like // any of a number of standard list item formats Pattern liPattern = matchListItemPattern(lastLineStartPosition); - if (liPattern != null) - { + if (liPattern != null) { Pattern currentPattern = matchListItemPattern(position); - if (liPattern == currentPattern) - { + if (liPattern == currentPattern) { result = true; } } } } } - if (result) - { + if (result) { position.setParagraphStart(); } } - private float multiplyFloat(float value1, float value2) - { + + private float multiplyFloat(float value1, float value2) { // multiply 2 floats and truncate the resulting value to 3 decimal places // to avoid wrong results when comparing with another float return Math.round(value1 * value2 * 1000) / 1000f; } + /** * writes the paragraph separator string to the output. - * + * * @throws IOException if something went wrong */ - protected void writeParagraphSeparator() throws IOException - { + protected void writeParagraphSeparator() throws IOException { + writeParagraphEnd(); writeParagraphStart(); } + /** * Write something (if defined) at the start of a paragraph. - * + * * @throws IOException if something went wrong */ - protected void writeParagraphStart() throws IOException - { - if (inParagraph) - { + protected void writeParagraphStart() throws IOException { + + if (inParagraph) { writeParagraphEnd(); inParagraph = false; } @@ -1598,77 +1498,81 @@ public class PDFTextStripper extends LegacyPDFStreamEngine inParagraph = true; } + /** * Write something (if defined) at the end of a paragraph. - * + * * @throws IOException if something went wrong */ - protected void writeParagraphEnd() throws IOException - { - if (!inParagraph) - { + protected void writeParagraphEnd() throws IOException { + + if (!inParagraph) { writeParagraphStart(); } output.write(getParagraphEnd()); inParagraph = false; } + /** * Write something (if defined) at the start of a page. - * + * * @throws IOException if something went wrong */ - protected void writePageStart() throws IOException - { + protected void writePageStart() throws IOException { + output.write(getPageStart()); } + /** * Write something (if defined) at the end of a page. - * + * * @throws IOException if something went wrong */ - protected void writePageEnd() throws IOException - { + protected void writePageEnd() throws IOException { + output.write(getPageEnd()); } + /** * returns the list item Pattern object that matches the text at the specified PositionWrapper or null if the text * does not match such a pattern. The list of Patterns tested against is given by the {@link #getListItemPatterns()} * method. To add to the list, simply override that method (if sub-classing) or explicitly supply your own list * using {@link #setListItemPatterns(List)}. - * + * * @param pw position * @return the matching pattern */ - private Pattern matchListItemPattern(PositionWrapper pw) - { + private Pattern matchListItemPattern(PositionWrapper pw) { + TextPosition tp = pw.getTextPosition(); String txt = tp.getUnicode(); return matchPattern(txt, getListItemPatterns()); } + /** * a list of regular expressions that match commonly used list item formats, i.e. bullets, numbers, letters, Roman * numerals, etc. Not meant to be comprehensive. */ - private static final String[] LIST_ITEM_EXPRESSIONS = { "\\.", "\\d+\\.", "\\[\\d+\\]", - "\\d+\\)", "[A-Z]\\.", "[a-z]\\.", "[A-Z]\\)", "[a-z]\\)", "[IVXL]+\\.", - "[ivxl]+\\.", }; + private static final String[] LIST_ITEM_EXPRESSIONS = {"\\.", "\\d+\\.", "\\[\\d+\\]", "\\d+\\)", "[A-Z]\\.", "[a-z]\\.", "[A-Z]\\)", "[a-z]\\)", "[IVXL]+\\.", "[ivxl]+\\.",}; private List listOfPatterns = null; + /** * use to supply a different set of regular expression patterns for matching list item starts. * * @param patterns list of patterns */ - protected void setListItemPatterns(List patterns) - { + protected void setListItemPatterns(List patterns) { + listOfPatterns = patterns; } + /** * returns a list of regular expression Patterns representing different common list item formats. For example * numbered items of form: @@ -1685,16 +1589,14 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * (matches "[1]", "[2]", ...). *

* This method returns a list of such regular expression Patterns. - * + * * @return a list of Pattern objects. */ - protected List getListItemPatterns() - { - if (listOfPatterns == null) - { + protected List getListItemPatterns() { + + if (listOfPatterns == null) { listOfPatterns = new ArrayList(); - for (String expression : LIST_ITEM_EXPRESSIONS) - { + for (String expression : LIST_ITEM_EXPRESSIONS) { Pattern p = Pattern.compile(expression); listOfPatterns.add(p); } @@ -1702,6 +1604,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine return listOfPatterns; } + /** * iterates over the specified list of Patterns until it finds one that matches the specified string. Then returns * the Pattern. @@ -1709,168 +1612,142 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * Order of the supplied list of patterns is important as most common patterns should come first. Patterns should be * strict in general, and all will be used with case sensitivity on. *

- * - * @param string the string to be searched + * + * @param string the string to be searched * @param patterns list of patterns * @return matching pattern */ - protected static Pattern matchPattern(String string, List patterns) - { - for (Pattern p : patterns) - { - if (p.matcher(string).matches()) - { + protected static Pattern matchPattern(String string, List patterns) { + + for (Pattern p : patterns) { + if (p.matcher(string).matches()) { return p; } } return null; } + /** * Write a list of string containing a whole line of a document. - * + * * @param line a list with the words of the given line * @throws IOException if something went wrong */ - private void writeLine(List line) - throws IOException - { + private void writeLine(List line) throws IOException { + int numberOfStrings = line.size(); - for (int i = 0; i < numberOfStrings; i++) - { + for (int i = 0; i < numberOfStrings; i++) { WordWithTextPositions word = line.get(i); writeString(word.getText(), word.getTextPositions()); - if (i < numberOfStrings - 1) - { + if (i < numberOfStrings - 1) { writeWordSeparator(); } } } + /** * Normalize the given list of TextPositions. - * + * * @param line list of TextPositions * @return a list of strings, one string for every word */ - private List normalize(List line) - { + private List normalize(List line) { + List normalized = new LinkedList(); StringBuilder lineBuilder = new StringBuilder(); List wordPositions = new ArrayList(); - for (LineItem item : line) - { + for (LineItem item : line) { lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, item); } - if (lineBuilder.length() > 0) - { + if (lineBuilder.length() > 0) { normalized.add(createWord(lineBuilder.toString(), wordPositions)); } return normalized; } + /** * Handles the LTR and RTL direction of the given words. The whole implementation stands and falls with the given * word. If the word is a full line, the results will be the best. If the word contains of single words or * characters, the order of the characters in a word or words in a line may wrong, due to RTL and LTR marks and * characters! - * + *

* Based on http://www.nesterovsky-bros.com/weblog/2013/07/28/VisualToLogicalConversionInJava.aspx - * + * * @param word The word that shall be processed * @return new word with the correct direction of the containing characters */ - private String handleDirection(String word) - { + private String handleDirection(String word) { + Bidi bidi = new Bidi(word, Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT); // if there is pure LTR text no need to process further - if (!bidi.isMixed() && bidi.getBaseLevel() == Bidi.DIRECTION_LEFT_TO_RIGHT) - { + if (!bidi.isMixed() && bidi.getBaseLevel() == Bidi.DIRECTION_LEFT_TO_RIGHT) { return word; } - + // collect individual bidi information int runCount = bidi.getRunCount(); byte[] levels = new byte[runCount]; Integer[] runs = new Integer[runCount]; - - for (int i = 0; i < runCount; i++) - { - levels[i] = (byte)bidi.getRunLevel(i); - runs[i] = i; + + for (int i = 0; i < runCount; i++) { + levels[i] = (byte) bidi.getRunLevel(i); + runs[i] = i; } // reorder individual parts based on their levels Bidi.reorderVisually(levels, 0, runs, 0, runCount); - + // collect the parts based on the direction within the run StringBuilder result = new StringBuilder(); - for (int i = 0; i < runCount; i++) - { - int index = runs[i]; - int start = bidi.getRunStart(index); - int end = bidi.getRunLimit(index); + for (int i = 0; i < runCount; i++) { + int index = runs[i]; + int start = bidi.getRunStart(index); + int end = bidi.getRunLimit(index); int level = levels[index]; - if ((level & 1) != 0) - { - while (--end >= start) - { + if ((level & 1) != 0) { + while (--end >= start) { char character = word.charAt(end); - if (Character.isMirrored(word.codePointAt(end))) - { - if (MIRRORING_CHAR_MAP.containsKey(character)) - { + if (Character.isMirrored(word.codePointAt(end))) { + if (MIRRORING_CHAR_MAP.containsKey(character)) { result.append(MIRRORING_CHAR_MAP.get(character)); - } - else - { + } else { result.append(character); } - } - else - { + } else { result.append(character); } } - } - else - { + } else { result.append(word, start, end); } } - + return result.toString(); } - private static Map MIRRORING_CHAR_MAP = new HashMap(); - static - { + private static Map MIRRORING_CHAR_MAP = new HashMap<>(); + + static { String path = "/org/apache/pdfbox/resources/text/BidiMirroring.txt"; InputStream input = new BufferedInputStream(PDFTextStripper.class.getResourceAsStream(path)); - try - { + try { parseBidiFile(input); - } - catch (IOException e) - { - LOG.warn("Could not parse BidiMirroring.txt, mirroring char map will be empty: " - + e.getMessage()); - } - finally - { - try - { + } catch (IOException e) { + LOG.warn("Could not parse BidiMirroring.txt, mirroring char map will be empty: " + e.getMessage()); + } finally { + try { input.close(); - } - catch (IOException e) - { + } catch (IOException e) { LOG.debug("Could not close BidiMirroring.txt ", e); } } @@ -1878,43 +1755,37 @@ public class PDFTextStripper extends LegacyPDFStreamEngine /** * This method parses the bidi file provided as inputstream. - * + * * @param inputStream - The bidi file as inputstream * @throws IOException if any line could not be read by the LineNumberReader */ - private static void parseBidiFile(InputStream inputStream) throws IOException - { + private static void parseBidiFile(InputStream inputStream) throws IOException { + LineNumberReader rd = new LineNumberReader(new InputStreamReader(inputStream)); - do - { + do { String s = rd.readLine(); - if (s == null) - { + if (s == null) { break; } int comment = s.indexOf('#'); // ignore comments - if (comment != -1) - { + if (comment != -1) { s = s.substring(0, comment); } - if (s.length() < 2) - { + if (s.length() < 2) { continue; } StringTokenizer st = new StringTokenizer(s, ";"); int nFields = st.countTokens(); Character[] fields = new Character[nFields]; - for (int i = 0; i < nFields; i++) - { + for (int i = 0; i < nFields; i++) { fields[i] = (char) Integer.parseInt(st.nextToken().trim(), 16); } - if (fields.length == 2) - { + if (fields.length == 2) { // initialize the MIRRORING_CHAR_MAP MIRRORING_CHAR_MAP.put(fields[0], fields[1]); } @@ -1922,14 +1793,16 @@ public class PDFTextStripper extends LegacyPDFStreamEngine } while (true); } + /** * Used within {@link #normalize(List)} to create a single {@link WordWithTextPositions} entry. */ - private WordWithTextPositions createWord(String word, List wordPositions) - { + private WordWithTextPositions createWord(String word, List wordPositions) { + return new WordWithTextPositions(normalizeWord(word), wordPositions); } + /** * Normalize certain Unicode characters. For example, convert the single "fi" ligature to "f" and "i". Also * normalises Arabic and Hebrew presentation forms. @@ -1937,72 +1810,57 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * @param word Word to normalize * @return Normalized word */ - private String normalizeWord(String word) - { + private String normalizeWord(String word) { + StringBuilder builder = null; int p = 0; int q = 0; int strLength = word.length(); - for (; q < strLength; q++) - { + for (; q < strLength; q++) { // We only normalize if the codepoint is in a given range. // Otherwise, NFKC converts too many things that would cause // confusion. For example, it converts the micro symbol in // extended Latin to the value in the Greek script. We normalize // the Unicode Alphabetic and Arabic A&B Presentation forms. char c = word.charAt(q); - if (0xFB00 <= c && c <= 0xFDFF || 0xFE70 <= c && c <= 0xFEFF) - { - if (builder == null) - { + if (0xFB00 <= c && c <= 0xFDFF || 0xFE70 <= c && c <= 0xFEFF) { + if (builder == null) { builder = new StringBuilder(strLength * 2); } builder.append(word, p, q); // Some fonts map U+FDF2 differently than the Unicode spec. // They add an extra U+0627 character to compensate. // This removes the extra character for those fonts. - if (c == 0xFDF2 && q > 0 - && (word.charAt(q - 1) == 0x0627 || word.charAt(q - 1) == 0xFE8D)) - { + if (c == 0xFDF2 && q > 0 && (word.charAt(q - 1) == 0x0627 || word.charAt(q - 1) == 0xFE8D)) { builder.append("\u0644\u0644\u0647"); - } - else - { + } else { // Trim because some decompositions have an extra space, such as U+FC5E - builder.append(Normalizer - .normalize(word.substring(q, q + 1), Normalizer.Form.NFKC).trim()); + builder.append(Normalizer.normalize(word.substring(q, q + 1), Normalizer.Form.NFKC).trim()); } p = q + 1; } } - if (builder == null) - { + if (builder == null) { return handleDirection(word); - } - else - { + } else { builder.append(word, p, q); return handleDirection(builder.toString()); } } + /** * Used within {@link #normalize(List)} to handle a {@link TextPosition}. - * + * * @return The StringBuilder that must be used when calling this method. */ - private StringBuilder normalizeAdd(List normalized, - StringBuilder lineBuilder, List wordPositions, LineItem item) - { - if (item.isWordSeparator()) - { - normalized.add( - createWord(lineBuilder.toString(), new ArrayList(wordPositions))); + private StringBuilder normalizeAdd(List normalized, StringBuilder lineBuilder, List wordPositions, LineItem item) { + + if (item.isWordSeparator()) { + normalized.add(createWord(lineBuilder.toString(), new ArrayList(wordPositions))); lineBuilder = new StringBuilder(); wordPositions.clear(); - } - else - { + } else { TextPosition text = item.getTextPosition(); lineBuilder.append(text.getUnicode()); wordPositions.add(text); @@ -2010,39 +1868,47 @@ public class PDFTextStripper extends LegacyPDFStreamEngine return lineBuilder; } + /** * internal marker class. Used as a place holder in a line of TextPositions. */ - private static final class LineItem - { + private static final class LineItem { + public static LineItem WORD_SEPARATOR = new LineItem(); - public static LineItem getWordSeparator() - { + + public static LineItem getWordSeparator() { + return WORD_SEPARATOR; } + private final TextPosition textPosition; - private LineItem() - { + + private LineItem() { + textPosition = null; } - LineItem(TextPosition textPosition) - { + + LineItem(TextPosition textPosition) { + this.textPosition = textPosition; } - public TextPosition getTextPosition() - { + + public TextPosition getTextPosition() { + return textPosition; } - public boolean isWordSeparator() - { + + public boolean isWordSeparator() { + return textPosition == null; } + } /** @@ -2051,26 +1917,30 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * * @author Axel Dörfler */ - private static final class WordWithTextPositions - { + private static final class WordWithTextPositions { + String text; List textPositions; - WordWithTextPositions(String word, List positions) - { + + WordWithTextPositions(String word, List positions) { + text = word; textPositions = positions; } - public String getText() - { + + public String getText() { + return text; } - public List getTextPositions() - { + + public List getTextPositions() { + return textPositions; } + } /** @@ -2080,11 +1950,11 @@ public class PDFTextStripper extends LegacyPDFStreamEngine * to subclasses. Also, conceptually TextPosition is immutable while these flags need to be set post-creation so it * makes sense to put these flags in this separate class. *

- * + * * @author m.martinez@ll.mit.edu */ - private static final class PositionWrapper - { + private static final class PositionWrapper { + private boolean isLineStart = false; private boolean isParagraphStart = false; private boolean isPageBreak = false; @@ -2093,89 +1963,103 @@ public class PDFTextStripper extends LegacyPDFStreamEngine private TextPosition position = null; + /** * Constructs a PositionWrapper around the specified TextPosition object. * * @param position the text position. */ - PositionWrapper(TextPosition position) - { + PositionWrapper(TextPosition position) { + this.position = position; } + /** * Returns the underlying TextPosition object. - * + * * @return the text position */ - public TextPosition getTextPosition() - { + public TextPosition getTextPosition() { + return position; } - public boolean isLineStart() - { + + public boolean isLineStart() { + return isLineStart; } + /** * Sets the isLineStart() flag to true. */ - public void setLineStart() - { + public void setLineStart() { + this.isLineStart = true; } - public boolean isParagraphStart() - { + + public boolean isParagraphStart() { + return isParagraphStart; } + /** * sets the isParagraphStart() flag to true. */ - public void setParagraphStart() - { + public void setParagraphStart() { + this.isParagraphStart = true; } - public boolean isArticleStart() - { + + public boolean isArticleStart() { + return isArticleStart; } + /** * Sets the isArticleStart() flag to true. */ - public void setArticleStart() - { + public void setArticleStart() { + this.isArticleStart = true; } - public boolean isPageBreak() - { + + public boolean isPageBreak() { + return isPageBreak; } + /** * Sets the isPageBreak() flag to true. */ - public void setPageBreak() - { + public void setPageBreak() { + this.isPageBreak = true; } - public boolean isHangingIndent() - { + + public boolean isHangingIndent() { + return isHangingIndent; } + /** * Sets the isHangingIndent() flag to true. */ - public void setHangingIndent() - { + public void setHangingIndent() { + this.isHangingIndent = true; } + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java index 330c9c15..724351e3 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java @@ -137,8 +137,7 @@ public class TextPositionSequence implements CharSequence { if (textPositions.get(0).getRotation() == 90) { return textPositions.get(0).getYDirAdj(); } else { - return textPositions.get(textPositions.size() - 1) - .getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidth() + 1; + return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidth() + 1; } } @@ -262,8 +261,7 @@ public class TextPositionSequence implements CharSequence { @JsonAttribute(ignore = true) public Rectangle getRectangle() { - log.debug("Page: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, toString(), textPositions.get(0) - .getRotation(), textPositions.get(0).getDir()); + log.debug("Page: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, textPositions.get(0).getRotation(), textPositions.get(0).getDir()); float height = getTextHeight(); @@ -275,36 +273,30 @@ public class TextPositionSequence implements CharSequence { if (textPositions.get(0).getRotation() == 0 && textPositions.get(0).getDir() == 90f) { posYInit = getX1(); - posYEnd = getX2() + textPositions.get(0).getWidthDirAdj() - textPositions.get(textPositions.size() - 1) - .getWidthDirAdj() - 3; + posYEnd = getX2() + textPositions.get(0).getWidthDirAdj() - textPositions.get(textPositions.size() - 1).getWidthDirAdj() - 3; posXInit = textPositions.get(0).getYDirAdj() + 2; posXEnd = textPositions.get(textPositions.size() - 1).getYDirAdj() - height; } else if (textPositions.get(0).getRotation() == 0 && textPositions.get(0).getDir() == 180f) { posXInit = textPositions.get(0).getPageWidth() - getX1() + 1; - posXEnd = textPositions.get(0).getPageWidth() - getX2() + textPositions.get(0) - .getWidthDirAdj() - textPositions.get(textPositions.size() - 1).getWidthDirAdj() - 3; + posXEnd = textPositions.get(0).getPageWidth() - getX2() + textPositions.get(0).getWidthDirAdj() - textPositions.get(textPositions.size() - 1).getWidthDirAdj() - 3; posYInit = textPositions.get(0).getYDirAdj() - height + 2; posYEnd = textPositions.get(textPositions.size() - 1).getYDirAdj() - height + 2; } else if (textPositions.get(0).getRotation() == 0 && textPositions.get(0).getDir() == 270f) { posYInit = textPositions.get(0).getPageHeight() - getX1(); - posYEnd = textPositions.get(0).getPageHeight() - getX2() - textPositions.get(0) - .getWidthDirAdj() - textPositions.get(textPositions.size() - 1).getWidthDirAdj() - 3; + posYEnd = textPositions.get(0).getPageHeight() - getX2() - textPositions.get(0).getWidthDirAdj() - textPositions.get(textPositions.size() - 1).getWidthDirAdj() - 3; posXInit = textPositions.get(0).getPageWidth() - textPositions.get(0).getYDirAdj() - 2; - posXEnd = textPositions.get(0).getPageWidth() - textPositions.get(textPositions.size() - 1) - .getYDirAdj() + height; + posXEnd = textPositions.get(0).getPageWidth() - textPositions.get(textPositions.size() - 1).getYDirAdj() + height; } else if (textPositions.get(0).getRotation() == 90 && textPositions.get(0).getDir() == 0.0f) { - posXInit = textPositions.get(textPositions.size() - 1) - .getXDirAdj() + textPositions.get(textPositions.size() - 1).getHeightDir(); + posXInit = textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getHeightDir(); posXEnd = textPositions.get(0).getXDirAdj(); posYInit = textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj() - 2; - posYEnd = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1) - .getYDirAdj() + 2; + posYEnd = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1).getYDirAdj() + 2; } else if (textPositions.get(0).getRotation() == 90 && textPositions.get(0).getDir() == 90.0f) { posXEnd = textPositions.get(0).getYDirAdj() + 2; @@ -313,29 +305,23 @@ public class TextPositionSequence implements CharSequence { } else if (textPositions.get(0).getRotation() == 90 && textPositions.get(0).getDir() == 180.0f) { - posXInit = textPositions.get(0).getPageWidth() - textPositions.get(textPositions.size() - 1) - .getXDirAdj() - 4; + posXInit = textPositions.get(0).getPageWidth() - textPositions.get(textPositions.size() - 1).getXDirAdj() - 4; posXEnd = textPositions.get(0).getPageWidth() - textPositions.get(0).getXDirAdj(); - posYInit = textPositions.get(0).getYDirAdj() - 2 - textPositions.get(textPositions.size() - 1) - .getHeightDir(); - posYEnd = textPositions.get(textPositions.size() - 1) - .getYDirAdj() - textPositions.get(textPositions.size() - 1).getHeightDir(); + posYInit = textPositions.get(0).getYDirAdj() - 2 - textPositions.get(textPositions.size() - 1).getHeightDir(); + posYEnd = textPositions.get(textPositions.size() - 1).getYDirAdj() - textPositions.get(textPositions.size() - 1).getHeightDir(); } else if (textPositions.get(0).getRotation() == 90 && textPositions.get(0).getDir() == 270.0f) { posXInit = textPositions.get(0).getPageWidth() - getX1(); posXEnd = textPositions.get(0).getPageWidth() - textPositions.get(0).getYDirAdj() - 2; posYInit = textPositions.get(0).getPageHeight() - getY1(); - posYEnd = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1) - .getXDirAdj() - height - 4; + posYEnd = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1).getXDirAdj() - height - 4; } else if (textPositions.get(0).getRotation() == 180 && textPositions.get(0).getDir() == 0f) { - posXEnd = textPositions.get(textPositions.size() - 1) - .getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + 1; + posXEnd = textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + 1; posYInit = textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj() - 2; - posYEnd = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1) - .getYDirAdj() + 2; + posYEnd = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1).getYDirAdj() + 2; } else if (textPositions.get(0).getRotation() == 180 && textPositions.get(0).getDir() == 90f) { @@ -347,27 +333,23 @@ public class TextPositionSequence implements CharSequence { } else if (textPositions.get(0).getRotation() == 180 && textPositions.get(0).getDir() == 180f) { posXInit = textPositions.get(0).getPageWidth() - getX1() + 1; - posXEnd = textPositions.get(0).getPageWidth() - getX2() + textPositions.get(0) - .getWidthDirAdj() - textPositions.get(textPositions.size() - 1).getWidthDirAdj() - 3; + posXEnd = textPositions.get(0).getPageWidth() - getX2() + textPositions.get(0).getWidthDirAdj() - textPositions.get(textPositions.size() - 1).getWidthDirAdj() - 3; posYInit = textPositions.get(0).getYDirAdj() - height + 2; posYEnd = textPositions.get(textPositions.size() - 1).getYDirAdj() - height + 2; } else if (textPositions.get(0).getRotation() == 180 && textPositions.get(0).getDir() == 270.0f) { posYInit = textPositions.get(0).getPageHeight() - getX1(); - posYEnd = textPositions.get(0).getPageHeight() - getX2() - textPositions.get(0) - .getWidthDirAdj() - textPositions.get(textPositions.size() - 1).getWidthDirAdj(); + posYEnd = textPositions.get(0).getPageHeight() - getX2() - textPositions.get(0).getWidthDirAdj() - textPositions.get(textPositions.size() - 1).getWidthDirAdj(); posXInit = textPositions.get(0).getPageWidth() - textPositions.get(0).getYDirAdj() - 2; - posXEnd = textPositions.get(0).getPageWidth() - textPositions.get(textPositions.size() - 1) - .getYDirAdj() + height; + posXEnd = textPositions.get(0).getPageWidth() - textPositions.get(textPositions.size() - 1).getYDirAdj() + height; } else if (textPositions.get(0).getRotation() == 270 && textPositions.get(0).getDir() == 0.0f) { posYInit = textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj() - 2; posYEnd = posYInit + 1; posXInit = textPositions.get(0).getXDirAdj(); - posXEnd = textPositions.get(textPositions.size() - 1) - .getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + 0.1f; + posXEnd = textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + 0.1f; } else if (textPositions.get(0).getRotation() == 270 && textPositions.get(0).getDir() == 90.0f) { @@ -388,16 +370,13 @@ public class TextPositionSequence implements CharSequence { posYInit = textPositions.get(0).getPageHeight() - getX1(); posYEnd = textPositions.get(0).getPageHeight() - getX2() - height; posXInit = textPositions.get(0).getPageWidth() - textPositions.get(0).getYDirAdj() - 2; - posXEnd = textPositions.get(0).getPageWidth() - textPositions.get(textPositions.size() - 1) - .getYDirAdj() + height; + posXEnd = textPositions.get(0).getPageWidth() - textPositions.get(textPositions.size() - 1).getYDirAdj() + height; } else { // page rotation = 0 and text direction = 0 - posXEnd = textPositions.get(textPositions.size() - 1) - .getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + 1; + posXEnd = textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + 1; posYInit = textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj() - 2; - posYEnd = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1) - .getYDirAdj() + 2; + posYEnd = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1).getYDirAdj() + 2; } var rectangle = new Rectangle(new Point(posXInit, posYInit), posXEnd - posXInit, posYEnd - posYInit + height, page); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/RedactionMessageReceiver.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/RedactionMessageReceiver.java index 5fed1c56..f5335eb4 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/RedactionMessageReceiver.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/queue/RedactionMessageReceiver.java @@ -1,5 +1,11 @@ package com.iqser.red.service.redaction.v1.server.queue; +import static com.iqser.red.service.redaction.v1.server.queue.MessagingConfiguration.REDACTION_DQL; + +import org.springframework.amqp.rabbit.annotation.RabbitHandler; +import org.springframework.amqp.rabbit.annotation.RabbitListener; +import org.springframework.stereotype.Service; + import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.redaction.v1.model.AnalyzeRequest; @@ -8,13 +14,9 @@ import com.iqser.red.service.redaction.v1.model.StructureAnalyzeRequest; import com.iqser.red.service.redaction.v1.server.client.FileStatusProcessingUpdateClient; import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService; import com.iqser.red.service.redaction.v1.server.redaction.service.ManualRedactionSurroundingTextService; + import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.springframework.amqp.rabbit.annotation.RabbitHandler; -import org.springframework.amqp.rabbit.annotation.RabbitListener; -import org.springframework.stereotype.Service; - -import static com.iqser.red.service.redaction.v1.server.queue.MessagingConfiguration.REDACTION_DQL; @Slf4j @Service @@ -55,6 +57,9 @@ public class RedactionMessageReceiver { result = manualRedactionSurroundingTextService.addSurroundingText(analyzeRequest.getDossierId(), analyzeRequest.getFileId(), analyzeRequest.getManualRedactions()); log.info("Successfully added surrounding text for manual redaction in dossierId {} and fileId {} took: {}", analyzeRequest.getDossierId(), analyzeRequest.getFileId(), result.getDuration()); break; + + default: + throw new IllegalArgumentException("Unknown MessageType: " + analyzeRequest.getMessageType()); } result.setMessageType(analyzeRequest.getMessageType()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index cdade187..8c67a896 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -643,7 +643,7 @@ public class Section { while (matcher.find()) { String match = matcher.group(group); if (StringUtils.isNotBlank(match) && match.length() >= 3) { - localDictionaryAdds.computeIfAbsent(asType, (x) -> new HashSet<>()).add(match); + localDictionaryAdds.computeIfAbsent(asType, x -> new HashSet<>()).add(match); } } } @@ -801,7 +801,7 @@ public class Section { } if (StringUtils.isNotBlank(cleanValue) && cleanValue.length() >= 3) { - localDictionaryAdds.computeIfAbsent(asType, (x) -> new HashSet<>()).add(cleanValue); + localDictionaryAdds.computeIfAbsent(asType, x -> new HashSet<>()).add(cleanValue); } } } @@ -865,7 +865,7 @@ public class Section { if (StringUtils.isNotBlank(match) && match.length() >= 3) { Set found = findEntities(match.trim(), asType, false, redaction, ruleNumber, reason, legalBasis, Engine.RULE, false); EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary); - localDictionaryAdds.computeIfAbsent(asType, (x) -> new HashSet<>()).add(match); + localDictionaryAdds.computeIfAbsent(asType, x -> new HashSet<>()).add(match); } } } @@ -980,7 +980,7 @@ public class Section { EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary); if (redactEverywhere && !isLocal()) { - localDictionaryAdds.computeIfAbsent(asType, (x) -> new HashSet<>()).add(value.trim()); + localDictionaryAdds.computeIfAbsent(asType, x -> new HashSet<>()).add(value.trim()); } } } @@ -1018,7 +1018,7 @@ public class Section { EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary); if (redactEverywhere && !isLocal()) { - localDictionaryAdds.computeIfAbsent(asType, (x) -> new HashSet<>()).add(value.trim()); + localDictionaryAdds.computeIfAbsent(asType, x -> new HashSet<>()).add(value.trim()); } } } @@ -1046,7 +1046,7 @@ public class Section { EntitySearchUtils.addEntitiesWithHigherRank(entities, found, dictionary); if (redactEverywhere && !isLocal()) { - localDictionaryAdds.computeIfAbsent(asType, (x) -> new HashSet<>()).add(line.trim()); + localDictionaryAdds.computeIfAbsent(asType, x -> new HashSet<>()).add(line.trim()); } } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index f7b82c49..33b41bd6 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -221,20 +221,18 @@ public class EntityRedactionService { private void addLocalValuesToDictionary(Section analysedSection, Dictionary dictionary) { - analysedSection.getLocalDictionaryAdds().keySet().forEach(key -> { - analysedSection.getLocalDictionaryAdds().get(key).forEach(value -> { + analysedSection.getLocalDictionaryAdds().keySet().forEach(key -> analysedSection.getLocalDictionaryAdds().get(key).forEach(value -> { - if (dictionary.getLocalAccessMap().get(key) == null) { - log.warn("Dictionary {} is null", key); - } + if (dictionary.getLocalAccessMap().get(key) == null) { + log.warn("Dictionary {} is null", key); + } - if (dictionary.getLocalAccessMap().get(key).getLocalEntries() == null) { - log.warn("Dictionary {} localEntries is null", key); - } + if (dictionary.getLocalAccessMap().get(key).getLocalEntries() == null) { + log.warn("Dictionary {} localEntries is null", key); + } - dictionary.getLocalAccessMap().get(key).getLocalEntries().add(value); - }); - }); + dictionary.getLocalAccessMap().get(key).getLocalEntries().add(value); + })); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java index 6c2e9921..56405a56 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java @@ -1,18 +1,28 @@ package com.iqser.red.service.redaction.v1.server.redaction.utils; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + import com.iqser.red.service.persistence.service.v1.api.model.annotations.AnnotationStatus; import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions; import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; -import com.iqser.red.service.redaction.v1.server.redaction.model.*; +import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel; +import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; +import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.model.EntityType; +import com.iqser.red.service.redaction.v1.server.redaction.model.Image; +import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; -import io.micrometer.core.annotation.Timed; import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; -import java.util.*; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - @Slf4j @UtilityClass @SuppressWarnings("PMD") @@ -93,33 +103,31 @@ public class EntitySearchUtils { } - private Set applyResizeRedactions(Set entitiesWithPositions, ManualRedactions manualRedactions) { + private void applyResizeRedactions(Set entitiesWithPositions, ManualRedactions manualRedactions) { - if (manualRedactions == null || manualRedactions.getResizeRedactions() == null || manualRedactions.getResizeRedactions().isEmpty()){ - return entitiesWithPositions; + if (manualRedactions == null || manualRedactions.getResizeRedactions() == null || manualRedactions.getResizeRedactions().isEmpty()) { + return; } - entitiesWithPositions.forEach(e -> e.getPositionSequences().forEach(pos -> { - manualRedactions.getResizeRedactions().stream().filter(resize -> resize.getStatus().equals(AnnotationStatus.APPROVED)).forEach(resize -> { - if (resize.getAnnotationId().equals(pos.getId())) { - if (resize.getValue().length() < e.getWord().length() && e.getWord().contains(resize.getValue())) { - int start = e.getWord().indexOf(resize.getValue()); - e.setStart(e.getStart() + start); - e.setEnd(e.getStart() + resize.getValue().length()); - e.setResized(true); - e.setWord(resize.getValue()); - } else if(resize.getValue().length() > e.getWord().length() && resize.getValue().contains(e.getWord())){ - int start = resize.getValue().indexOf(e.getWord()); - e.setStart(e.getStart() - start); - e.setEnd(e.getStart() + resize.getValue().length()); - e.setResized(true); - e.setWord(resize.getValue()); + entitiesWithPositions.forEach(e -> e.getPositionSequences() + .forEach(pos -> manualRedactions.getResizeRedactions().stream().filter(resize -> resize.getStatus().equals(AnnotationStatus.APPROVED)).forEach(resize -> { + if (resize.getAnnotationId().equals(pos.getId())) { + if (resize.getValue().length() < e.getWord().length() && e.getWord().contains(resize.getValue())) { + int start = e.getWord().indexOf(resize.getValue()); + e.setStart(e.getStart() + start); + e.setEnd(e.getStart() + resize.getValue().length()); + e.setResized(true); + e.setWord(resize.getValue()); + } else if (resize.getValue().length() > e.getWord().length() && resize.getValue().contains(e.getWord())) { + int start = resize.getValue().indexOf(e.getWord()); + e.setStart(e.getStart() - start); + e.setEnd(e.getStart() + resize.getValue().length()); + e.setResized(true); + e.setWord(resize.getValue()); + } } - } - }); - })); + }))); - return entitiesWithPositions; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java index ffd0e004..efbf1b5e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java @@ -1,27 +1,5 @@ package com.iqser.red.service.redaction.v1.server.segmentation; -import com.iqser.red.service.redaction.v1.server.classification.model.Document; -import com.iqser.red.service.redaction.v1.server.classification.model.Page; -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; -import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService; -import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService; -import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper; -import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; -import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings; -import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; -import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.SystemUtils; -import org.apache.pdfbox.io.MemoryUsageSetting; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.common.PDRectangle; -import org.springframework.stereotype.Service; - import java.io.File; import java.io.FileOutputStream; import java.io.IOException; @@ -35,6 +13,30 @@ import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.SystemUtils; +import org.apache.pdfbox.io.MemoryUsageSetting; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.springframework.stereotype.Service; + +import com.iqser.red.service.redaction.v1.server.classification.model.Document; +import com.iqser.red.service.redaction.v1.server.classification.model.Page; +import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService; +import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService; +import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper; +import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings; +import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; +import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + @Slf4j @Service @RequiredArgsConstructor @@ -72,7 +74,7 @@ public class PdfSegmentationService { Document document = new Document(); List pages = new ArrayList<>(); - pdDocument = PDDocument.load(tempFile, MemoryUsageSetting.setupMixed(1024 * 1024 * 64)); + pdDocument = PDDocument.load(tempFile, MemoryUsageSetting.setupMixed(67108864L)); pdDocument.setAllSecurityToBeRemoved(true); long pageCount = pdDocument.getNumberOfPages(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index f5a01599..4287408d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -1,37 +1,28 @@ package com.iqser.red.service.redaction.v1.server; -import com.amazonaws.services.s3.AmazonS3; -import com.fasterxml.jackson.core.type.TypeReference; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.iqser.red.service.persistence.service.v1.api.model.annotations.AnnotationStatus; -import com.iqser.red.service.persistence.service.v1.api.model.annotations.Comment; -import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions; -import com.iqser.red.service.persistence.service.v1.api.model.annotations.Rectangle; -import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.*; -import com.iqser.red.service.persistence.service.v1.api.model.common.JSONPrimitive; -import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.configuration.Colors; -import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType; -import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.DictionaryEntry; -import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.Type; -import com.iqser.red.service.redaction.v1.model.*; -import com.iqser.red.service.redaction.v1.server.annotate.AnnotateRequest; -import com.iqser.red.service.redaction.v1.server.annotate.AnnotateResponse; -import com.iqser.red.service.redaction.v1.server.annotate.AnnotationService; -import com.iqser.red.service.redaction.v1.server.classification.model.SectionText; -import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; -import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient; -import com.iqser.red.service.redaction.v1.server.client.RulesClient; -import com.iqser.red.service.redaction.v1.server.controller.RedactionController; -import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService; -import com.iqser.red.service.redaction.v1.server.redaction.service.ManualRedactionSurroundingTextService; -import com.iqser.red.service.redaction.v1.server.redaction.utils.OsUtils; -import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; -import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; -import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; -import com.iqser.red.storage.commons.StorageAutoConfiguration; -import com.iqser.red.storage.commons.service.StorageService; +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.when; -import lombok.SneakyThrows; +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.junit.After; @@ -57,16 +48,49 @@ import org.springframework.context.annotation.Primary; import org.springframework.core.io.ClassPathResource; import org.springframework.test.context.junit4.SpringRunner; -import java.io.*; -import java.net.URL; -import java.nio.charset.StandardCharsets; -import java.time.OffsetDateTime; -import java.time.ZoneOffset; -import java.util.*; -import java.util.stream.Collectors; +import com.amazonaws.services.s3.AmazonS3; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.AnnotationStatus; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.Comment; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.ManualRedactions; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.Rectangle; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.IdRemoval; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.ManualForceRedaction; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.ManualImageRecategorization; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.ManualLegalBasisChange; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.ManualRedactionEntry; +import com.iqser.red.service.persistence.service.v1.api.model.annotations.entitymapped.ManualResizeRedaction; +import com.iqser.red.service.persistence.service.v1.api.model.common.JSONPrimitive; +import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.configuration.Colors; +import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType; +import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.DictionaryEntry; +import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.Type; +import com.iqser.red.service.redaction.v1.model.AnalyzeRequest; +import com.iqser.red.service.redaction.v1.model.AnalyzeResult; +import com.iqser.red.service.redaction.v1.model.FileAttribute; +import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; +import com.iqser.red.service.redaction.v1.model.RedactionRequest; +import com.iqser.red.service.redaction.v1.model.RedactionResult; +import com.iqser.red.service.redaction.v1.model.StructureAnalyzeRequest; +import com.iqser.red.service.redaction.v1.server.annotate.AnnotateRequest; +import com.iqser.red.service.redaction.v1.server.annotate.AnnotateResponse; +import com.iqser.red.service.redaction.v1.server.annotate.AnnotationService; +import com.iqser.red.service.redaction.v1.server.classification.model.SectionText; +import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; +import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient; +import com.iqser.red.service.redaction.v1.server.client.RulesClient; +import com.iqser.red.service.redaction.v1.server.controller.RedactionController; +import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService; +import com.iqser.red.service.redaction.v1.server.redaction.service.ManualRedactionSurroundingTextService; +import com.iqser.red.service.redaction.v1.server.redaction.utils.OsUtils; +import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; +import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; +import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; +import com.iqser.red.storage.commons.StorageAutoConfiguration; +import com.iqser.red.storage.commons.service.StorageService; -import static org.assertj.core.api.Assertions.assertThat; -import static org.mockito.Mockito.when; +import lombok.SneakyThrows; @RunWith(SpringRunner.class) @SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) @@ -1712,10 +1736,10 @@ public class RedactionIntegrationTest { return; } if (entry.getValue().equals("David")){ - assertThat(entry.getImportedRedactionIntersections().size()).isEqualTo(1); + assertThat(entry.getImportedRedactionIntersections()).hasSize(1); } if (entry.getValue().equals("annotation")){ - assertThat(entry.getImportedRedactionIntersections().size()).isEqualTo(0); + assertThat(entry.getImportedRedactionIntersections()).isEmpty(); } }); @@ -1728,12 +1752,12 @@ public class RedactionIntegrationTest { @Test public void testExpandByPrefixRegEx() throws IOException { - assertThat(dictionary.get(AUTHOR).contains("Robinson")); - assertThat(!dictionary.get(AUTHOR).contains("Mrs. Robinson")); - assertThat(dictionary.get(AUTHOR).contains("Bojangles")); - assertThat(!dictionary.get(AUTHOR).contains("Mr. Bojangles")); - assertThat(dictionary.get(AUTHOR).contains("Tambourine Man")); - assertThat(!dictionary.get(AUTHOR).contains("Mr. Tambourine Man")); + assertThat(dictionary.get(AUTHOR)).contains("Robinson"); + assertThat(dictionary.get(AUTHOR)).doesNotContain("Mrs. Robinson"); + assertThat(dictionary.get(AUTHOR)).contains("Bojangles"); + assertThat(dictionary.get(AUTHOR)).doesNotContain("Mr. Bojangles"); + assertThat(dictionary.get(AUTHOR)).contains("Tambourine Man"); + assertThat(dictionary.get(AUTHOR)).doesNotContain("Mr. Tambourine Man"); String fileName = "files/mr-mrs.pdf"; String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf"; diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/realdata/LiveDataIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/realdata/LiveDataIntegrationTest.java index 2ddc3ec3..c7ea9305 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/realdata/LiveDataIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/realdata/LiveDataIntegrationTest.java @@ -1,22 +1,17 @@ package com.iqser.red.service.redaction.v1.server.realdata; +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyBoolean; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.nullable; +import static org.mockito.Mockito.when; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; -import com.fasterxml.jackson.core.type.TypeReference; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.iqser.red.service.persistence.service.v1.api.model.common.JSONPrimitive; -import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.configuration.Colors; -import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.DictionaryEntry; -import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.Type; -import com.iqser.red.service.redaction.v1.server.Application; -import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService; -import com.iqser.red.service.redaction.v1.server.client.*; -import com.iqser.red.service.redaction.v1.server.queue.RedactionMessageReceiver; -import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService; -import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings; -import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; -import com.iqser.red.storage.commons.StorageAutoConfiguration; -import com.iqser.red.storage.commons.service.StorageService; -import lombok.SneakyThrows; import org.apache.commons.io.IOUtils; import org.junit.Before; import org.junit.Test; @@ -35,13 +30,27 @@ import org.springframework.core.io.Resource; import org.springframework.core.io.support.ResourcePatternResolver; import org.springframework.test.context.junit4.SpringRunner; -import java.util.ArrayList; -import java.util.List; -import java.util.stream.Collectors; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.service.persistence.service.v1.api.model.common.JSONPrimitive; +import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.configuration.Colors; +import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.DictionaryEntry; +import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.type.Type; +import com.iqser.red.service.redaction.v1.server.Application; +import com.iqser.red.service.redaction.v1.server.FileSystemBackedStorageService; +import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; +import com.iqser.red.service.redaction.v1.server.client.EntityRecognitionClient; +import com.iqser.red.service.redaction.v1.server.client.FileStatusProcessingUpdateClient; +import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient; +import com.iqser.red.service.redaction.v1.server.client.RulesClient; +import com.iqser.red.service.redaction.v1.server.queue.RedactionMessageReceiver; +import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService; +import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings; +import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; +import com.iqser.red.storage.commons.StorageAutoConfiguration; +import com.iqser.red.storage.commons.service.StorageService; -import static org.assertj.core.api.AssertionsForClassTypes.assertThat; -import static org.mockito.ArgumentMatchers.*; -import static org.mockito.Mockito.when; +import lombok.SneakyThrows; @RunWith(SpringRunner.class) @SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) @@ -114,9 +123,8 @@ public class LiveDataIntegrationTest { ObjectMapper objectMapper = new ObjectMapper(); var jsonNode = objectMapper.readTree(new ClassPathResource(BASE_DIR + EFSA_SANITISATION_GFL_V1 + "types.json").getInputStream()); - types = objectMapper.treeToValue(jsonNode.get("types"), objectMapper.getTypeFactory().constructType( - new TypeReference>() { - })); + types = objectMapper.treeToValue(jsonNode.get("types"), objectMapper.getTypeFactory().constructType(new TypeReference>() { + })); types.forEach(t -> { t.setId(t.getType()); @@ -126,27 +134,28 @@ public class LiveDataIntegrationTest { when(dictionaryClient.getAllTypesForDossierTemplate(anyString(), anyBoolean())).thenReturn(types); when(dictionaryClient.getAllTypesForDossier(anyString(), anyBoolean())).thenReturn(new ArrayList<>()); - when(dictionaryClient.getColors(anyString())).thenReturn(objectMapper.readValue(new ClassPathResource(BASE_DIR + EFSA_SANITISATION_GFL_V1 + "colors.json").getInputStream(), Colors.class)); when(dictionaryClient.getDictionaryForType(anyString(), nullable(Long.class))).then(answer -> { String typeName = answer.getArgument(0); var found = types.stream().filter(t -> t.getType().equalsIgnoreCase(typeName)).findFirst(); - if(found.isPresent()) { + if (found.isPresent()) { var type = types.stream().filter(t -> t.getType().equalsIgnoreCase(typeName)).findFirst().get(); type.setEntries(getEntries(typeName, type.getTypeId())); return type; - }else{ + } else { return null; } }); - dictionaryService.updateDictionary("dossierTemplateId","dossierId"); + dictionaryService.updateDictionary("dossierTemplateId", "dossierId"); } + public void simulateIncrement(List values, String deltaTypeName, long version) { + when(dictionaryClient.getVersion(anyString())).thenReturn(version); when(dictionaryClient.getVersionForDossier(anyString())).thenReturn(version); @@ -154,7 +163,7 @@ public class LiveDataIntegrationTest { String typeName = answer.getArgument(0); var found = types.stream().filter(t -> t.getType().equalsIgnoreCase(typeName)).findFirst(); - if(found.isPresent()) { + if (found.isPresent()) { var type = types.stream().filter(t -> t.getType().equalsIgnoreCase(typeName)).findFirst().get(); @@ -165,7 +174,7 @@ public class LiveDataIntegrationTest { } return type; - }else{ + } else { return null; } }); @@ -174,17 +183,20 @@ public class LiveDataIntegrationTest { @Test public void testUpdateDictionary() { + dictionaryService.updateDictionary("dossierTemplateId", "dossierId"); var dict = dictionaryService.getDeepCopyDictionary("dossierTemplateId", "dossierId"); assertThat(dict.getLocalAccessMap().size()).isEqualTo(12); } + @SneakyThrows private List getEntries(String typeName, String typeId) { + Resource[] dictionaryResources = resourcePatternResolver.getResources("classpath:" + BASE_DIR + EFSA_SANITISATION_GFL_V1 + "dictionaries/**"); for (var resource : dictionaryResources) { - if (resource.getFilename().contains(typeName)) { + if (Objects.requireNonNull(resource.getFilename()).contains(typeName)) { List lines = IOUtils.readLines(resource.getInputStream()); return lines.stream().map(l -> new DictionaryEntry(0, l, 0L, false, typeId)).collect(Collectors.toList()); @@ -193,4 +205,5 @@ public class LiveDataIntegrationTest { } return new ArrayList<>(); } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java index 2e423cb5..18307012 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java @@ -1,23 +1,17 @@ package com.iqser.red.service.redaction.v1.server.segmentation; -import com.amazonaws.services.s3.AmazonS3; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.iqser.red.service.redaction.v1.server.Application; -import com.iqser.red.service.redaction.v1.server.classification.model.Document; -import com.iqser.red.service.redaction.v1.server.classification.model.Page; -import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService; -import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient; -import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType; -import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; -import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D; -import com.iqser.red.service.redaction.v1.server.redaction.model.image.ImageServiceResponse; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; -import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; -import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; -import org.junit.Ignore; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.stream.Collectors; + import org.junit.Test; -import org.junit.platform.commons.util.StringUtils; import org.junit.runner.RunWith; import org.kie.api.runtime.KieContainer; import org.springframework.amqp.rabbit.core.RabbitTemplate; @@ -31,20 +25,20 @@ import org.springframework.context.annotation.Import; import org.springframework.core.io.ClassPathResource; import org.springframework.test.context.junit4.SpringRunner; -import javax.imageio.ImageIO; -import java.io.ByteArrayOutputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.stream.Collectors; - -import static com.iqser.red.service.redaction.v1.server.redaction.utils.OsUtils.getTemporaryDirectory; -import static org.assertj.core.api.Assertions.assertThat; +import com.amazonaws.services.s3.AmazonS3; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.service.redaction.v1.server.Application; +import com.iqser.red.service.redaction.v1.server.classification.model.Document; +import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService; +import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient; +import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType; +import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; +import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D; +import com.iqser.red.service.redaction.v1.server.redaction.model.image.ImageServiceResponse; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; +import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; +import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; import lombok.SneakyThrows; @@ -89,15 +83,18 @@ public class PdfSegmentationServiceTest { @Test @SneakyThrows - public void testMapping(){ + public void testMapping() { + ClassPathResource responseJson = new ClassPathResource("files/image_response.json"); ImageServiceResponse imageServiceResponse = objectMapper.readValue(responseJson.getInputStream(), ImageServiceResponse.class); Map> images = new HashMap<>(); - imageServiceResponse.getData().stream().forEach(imageMetadata -> { - images.computeIfAbsent(imageMetadata.getPosition().getPageNumber() ,x -> new ArrayList<>()) - .add(new PdfImage(new RedRectangle2D(imageMetadata.getPosition().getX1(), imageMetadata.getPosition().getY1(), imageMetadata.getGeometry().getWidth(), imageMetadata.getGeometry().getHeight()), ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)), imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber())); - }); + imageServiceResponse.getData() + .forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>()) + .add(new PdfImage(new RedRectangle2D(imageMetadata.getPosition().getX1(), imageMetadata.getPosition().getY1(), imageMetadata.getGeometry() + .getWidth(), imageMetadata.getGeometry().getHeight()), ImageType.valueOf(imageMetadata.getClassification() + .getLabel() + .toUpperCase(Locale.ROOT)), imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber()))); System.out.println("object"); }