diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Orientation.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Orientation.java new file mode 100644 index 00000000..e0f4e1a9 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Orientation.java @@ -0,0 +1,6 @@ +package com.iqser.red.service.redaction.v1.server.classification.model; + +public enum Orientation { + + NONE, LEFT, RIGHT +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java index 63cfc11c..396006ae 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java @@ -32,6 +32,7 @@ public class TextBlock extends AbstractTextContainer { private String classification; + public TextBlock(float minX, float maxX, float minY, float maxY, List sequences, int rotation) { this.minX = minX; this.maxX = maxX; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java index 4badfec4..d394bbe6 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java @@ -1,6 +1,7 @@ package com.iqser.red.service.redaction.v1.server.classification.service; import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter; +import com.iqser.red.service.redaction.v1.server.classification.model.Orientation; import com.iqser.red.service.redaction.v1.server.classification.model.Page; import com.iqser.red.service.redaction.v1.server.classification.model.StringFrequencyCounter; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; @@ -11,16 +12,21 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; + import org.springframework.stereotype.Service; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; @Service @SuppressWarnings("all") public class BlockificationService { - public Page blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + static final float THRESHOLD = 1f; + + public Page blockify(List textPositions, List horizontalRulingLines, + List verticalRulingLines) { List chunkWords = new ArrayList<>(); List chunkBlockList1 = new ArrayList<>(); @@ -28,21 +34,37 @@ public class BlockificationService { float minX = 1000, maxX = 0, minY = 1000, maxY = 0; TextPositionSequence prev = null; + boolean wasSplitted = false; + Float splitX1 = null; for (TextPositionSequence word : textPositions) { boolean lineSeparation = minY - word.getY2() > word.getHeight() * 1.25; boolean startFromTop = word.getY1() > maxY + word.getHeight(); + boolean splitByX = prev != null && maxX + 50 < word.getX1() && prev.getY1() == word.getY1(); + boolean newLineAfterSplit = prev != null && word.getY1() != prev.getY1() && wasSplitted && splitX1 != word.getX1(); + boolean splittedByRuling = word.getRotation() == 0 && isSplittedByRuling(maxX, minY, word.getX1(), word.getY1(), verticalRulingLines) || word + .getRotation() == 0 && isSplittedByRuling(minX, minY, word.getX1(), word.getY2(), horizontalRulingLines) || word + .getRotation() == 90 && isSplittedByRuling(maxX, minY, word.getX1(), word.getY1(), horizontalRulingLines) || word + .getRotation() == 90 && isSplittedByRuling(minX, minY, word.getX1(), word.getY2(), verticalRulingLines); - if (prev != null && (lineSeparation || startFromTop || word.getRotation() == 0 && isSplittedByRuling(maxX, minY, word - .getX1(), word.getY1(), verticalRulingLines) || word.getRotation() == 0 && isSplittedByRuling(minX, minY, word - .getX1(), word.getY2(), horizontalRulingLines) || word.getRotation() == 90 && isSplittedByRuling(maxX, minY, word - .getX1(), word.getY1(), horizontalRulingLines) || word.getRotation() == 90 && isSplittedByRuling(minX, minY, word - .getX1(), word.getY2(), verticalRulingLines))) { + if (prev != null && (lineSeparation || startFromTop || splitByX || newLineAfterSplit || splittedByRuling)) { TextBlock cb1 = buildTextBlock(chunkWords); chunkBlockList1.add(cb1); chunkWords = new ArrayList<>(); + if (splitByX && !splittedByRuling) { + wasSplitted = true; + cb1.setOrientation(Orientation.LEFT); + splitX1 = word.getX1(); + } + + if (newLineAfterSplit && !splittedByRuling) { + wasSplitted = false; + cb1.setOrientation(Orientation.RIGHT); + splitX1 = null; + } + minX = 1000; maxX = 0; minY = 1000; @@ -72,9 +94,62 @@ public class BlockificationService { chunkBlockList1.add(cb1); } + Iterator itty = chunkBlockList1.iterator(); + + TextBlock previousLeft = null; + TextBlock previousRight = null; + while (itty.hasNext()) { + TextBlock block = (TextBlock) itty.next(); + + if(previousLeft != null && block.getOrientation().equals(Orientation.LEFT)){ + if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()){ + previousLeft.add(block); + itty.remove(); + continue; + } + } + + if(previousRight != null && block.getOrientation().equals(Orientation.RIGHT)){ + if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()){ + previousRight.add(block); + itty.remove(); + continue; + } + } + + if (block.getOrientation().equals(Orientation.LEFT)) { + previousLeft = block; + } else if (block.getOrientation().equals(Orientation.RIGHT)) { + previousRight = block; + } + } + + + itty = chunkBlockList1.iterator(); + TextBlock previous = null; + while (itty.hasNext()) { + TextBlock block = (TextBlock) itty.next(); + + if(previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(), previous + .getMaxY())|| + previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous + .getMaxY())){ + previous.add(block); + itty.remove(); + continue; + } + + previous = block; + } + + return new Page(chunkBlockList1); } + private boolean equalsWithThreshold(float f1, float f2){ + return Math.abs(f1 - f2) < THRESHOLD; + } + private TextBlock buildTextBlock(List wordBlockList) { @@ -117,7 +192,8 @@ public class BlockificationService { } - private boolean isSplittedByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines) { + private boolean isSplittedByRuling(float previousX2, float previousY1, float currentX1, float currentY1, + List rulingLines) { for (Ruling ruling : rulingLines) { if (ruling.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { @@ -128,7 +204,8 @@ public class BlockificationService { } - public Rectangle calculateBodyTextFrame(List pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) { + public Rectangle calculateBodyTextFrame(List pages, FloatFrequencyCounter documentFontSizeCounter, + boolean landscape) { float minX = 10000; float maxX = -100; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java index b050e27b..cb7cfde5 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java @@ -2,6 +2,8 @@ package com.iqser.red.service.redaction.v1.server.tableextraction.model; import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.model.Rectangle; +import com.iqser.red.service.redaction.v1.server.classification.model.Orientation; + import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; @@ -18,6 +20,8 @@ public abstract class AbstractTextContainer { protected String classification; protected int page; + private Orientation orientation = Orientation.NONE; + public abstract String getText(); public boolean contains(AbstractTextContainer other) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/PdfVisualisationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/PdfVisualisationService.java index 06ccb399..390810a4 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/PdfVisualisationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/PdfVisualisationService.java @@ -102,7 +102,7 @@ public class PdfVisualisationService { contentStream.newLineAtOffset(textBlock.getMinX(), textBlock.getMaxY()); - contentStream.showText(textBlock.getClassification()); + contentStream.showText(textBlock.getClassification() + textBlock.getOrientation()); contentStream.endText(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index cb0f03da..a878400d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -680,8 +680,8 @@ public class RedactionIntegrationTest { dictionary.get(AUTHOR).add("physical"); reanlysisVersions.put("physical", 2L); - dictionary.get(VERTEBRATE).add("s-metolachlor"); - reanlysisVersions.put("s-metolachlor", 3L); +// dictionary.get(VERTEBRATE).add("s-metolachlor"); +// reanlysisVersions.put("s-metolachlor", 3L); when(dictionaryClient.getVersion(TEST_DOSSIER_TEMPLATE_ID, DictionaryResource.GLOBAL_DOSSIER)).thenReturn(3L); @@ -805,7 +805,7 @@ public class RedactionIntegrationTest { public void classificationTest() throws IOException { System.out.println("classificationTest"); - ClassPathResource pdfFileResource = new ClassPathResource("files/Trinexapac/93 Trinexapac-ethyl_RAR_03_Volume_3CA_B-1_2017-03-31.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf"); AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_address.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_address.txt index c04fedbe..8de2feb7 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_address.txt +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/CBI_address.txt @@ -1652,3 +1652,4 @@ Zoecon Corp. Zoecon Corp., Palo Alto, USA Zyma SA Zyma SA, Nyon, Switzerland +Mambo-Tox Ltd. Biomedical Sciences Building Bassett Crescent East Southampton SO16 7PX UK