diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/BlockificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/BlockificationService.java index d67447a9..d7a96673 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/BlockificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/BlockificationService.java @@ -19,260 +19,7 @@ import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.mo import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil; -@Service -@SuppressWarnings("all") -public class BlockificationService { - - static final float THRESHOLD = 1f; - - - /** - * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions. - * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! - * Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. - * - * @param textPositions The words of a page. - * @param horizontalRulingLines Horizontal table lines. - * @param verticalRulingLines Vertical table lines. - * @return Page object that contains the Textblock and text statistics. 
- */ - public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { - - int indexOnPage = 0; - List chunkWords = new ArrayList<>(); - List chunkBlockList = new ArrayList<>(); - - float minX = 1000, maxX = 0, minY = 1000, maxY = 0; - TextPositionSequence prev = null; - - boolean wasSplitted = false; - Float splitX1 = null; - for (TextPositionSequence word : textPositions) { - - boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25; - boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight(); - boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); - boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX; - boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); - boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); - boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); - - if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { - - Orientation prevOrientation = null; - if (!chunkBlockList.isEmpty()) { - prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation(); - } - - TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); - indexOnPage++; - - chunkBlockList.add(cb1); - chunkWords = new ArrayList<>(); - - if (splitByX && !isSplitByRuling) { - wasSplitted = true; - cb1.setOrientation(Orientation.LEFT); - splitX1 = word.getMinXDirAdj(); - } else if (newLineAfterSplit && !isSplitByRuling) { - wasSplitted = false; - cb1.setOrientation(Orientation.RIGHT); - splitX1 = null; - } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || 
!isSplitByRuling)) { - cb1.setOrientation(Orientation.LEFT); - } - - minX = 1000; - maxX = 0; - minY = 1000; - maxY = 0; - prev = null; - } - - chunkWords.add(word); - - prev = word; - if (word.getMinXDirAdj() < minX) { - minX = word.getMinXDirAdj(); - } - if (word.getMaxXDirAdj() > maxX) { - maxX = word.getMaxXDirAdj(); - } - if (word.getMinYDirAdj() < minY) { - minY = word.getMinYDirAdj(); - } - if (word.getMaxYDirAdj() > maxY) { - maxY = word.getMaxYDirAdj(); - } - } - - TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); - if (cb1 != null) { - chunkBlockList.add(cb1); - } - - Iterator itty = chunkBlockList.iterator(); - - TextPageBlock previousLeft = null; - TextPageBlock previousRight = null; - while (itty.hasNext()) { - TextPageBlock block = (TextPageBlock) itty.next(); - - if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) { - if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) { - previousLeft.add(block); - itty.remove(); - continue; - } - } - - if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) { - if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) { - previousRight.add(block); - itty.remove(); - continue; - } - } - - if (block.getOrientation().equals(Orientation.LEFT)) { - previousLeft = block; - } else if (block.getOrientation().equals(Orientation.RIGHT)) { - previousRight = block; - } - } - - itty = chunkBlockList.iterator(); - TextPageBlock previous = null; - while (itty.hasNext()) { - TextPageBlock block = (TextPageBlock) itty.next(); - - if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(), - previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation() - 
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) { - previous.add(block); - itty.remove(); - continue; - } - - previous = block; - } - - return new ClassificationPage(chunkBlockList); - } - - - private boolean equalsWithThreshold(float f1, float f2) { - - return Math.abs(f1 - f2) < THRESHOLD; - } - - - private TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { - - TextPageBlock textBlock = null; - - FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); - FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); - FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); - StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter(); - StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); - - for (TextPositionSequence wordBlock : wordBlockList) { - - lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); - fontSizeFrequencyCounter.add(wordBlock.getFontSize()); - spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); - fontFrequencyCounter.add(wordBlock.getFont()); - styleFrequencyCounter.add(wordBlock.getFontStyle()); - - if (textBlock == null) { - textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), - wordBlock.getMaxXDirAdj(), - wordBlock.getMinYDirAdj(), - wordBlock.getMaxYDirAdj(), - wordBlockList, - wordBlock.getRotation()); - } else { - TextPageBlock spatialEntity = textBlock.union(wordBlock); - textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); - } - } - - if (textBlock != null) { - textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); - 
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); - textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); - } - - if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { - textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); - } - return textBlock; - } - - - private boolean isSplitByRuling(float minX, - float minY, - float maxX, - float maxY, - TextPositionSequence word, - List horizontalRulingLines, - List verticalRulingLines) { - - return isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - verticalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - verticalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()); - } - - - private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, float pageWidth, float pageHeight) { - - for (Ruling ruling : rulingLines) { - var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); - if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { - return true; - } - } - return false; - } - - - private double round(float value, int decimalPoints) { - - var d = Math.pow(10, decimalPoints); - return Math.round(value * d) / d; - } +public interface BlockificationService { + 
ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineBlockificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineBlockificationService.java new file mode 100644 index 00000000..cd9b7c00 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineBlockificationService.java @@ -0,0 +1,198 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service; + +import static java.util.stream.Collectors.toSet; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; + +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.stereotype.Service; + +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.Orientation; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence; +import 
com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil; + +@Service +@ConditionalOnProperty(prefix = "application", name = "type", havingValue = "DocuMine") +@SuppressWarnings("all") +public class DocuMineBlockificationService implements BlockificationService{ + + static final float THRESHOLD = 1f; + + + /** + * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions. + * This method must use text direction adjusted positions (DirAdj). Where {0,0} is on the upper left. Never try to change this! + * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. + * @param textPositions The words of a page. + * @param horizontalRulingLines Horizontal table lines. + * @param verticalRulingLines Vertical table lines. + * @return Page object that contains the Textblock and text statistics. + */ + public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + + List chunkWords = new ArrayList<>(); + List chunkBlockList1 = new ArrayList<>(); + + float minX = 1000, maxX = 0, minY = 1000, maxY = 0; + TextPositionSequence prev = null; + + boolean wasSplitted = false; + Float splitX1 = null; + for (TextPositionSequence word : textPositions) { + + boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25; + boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight(); + boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); + boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX; + boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); + boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, 
verticalRulingLines); + boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); + boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")); + + if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY)) { + + Orientation prevOrientation = null; + if (!chunkBlockList1.isEmpty()) { + prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation(); + } + + TextPageBlock cb1 = buildTextBlock(chunkWords); + chunkBlockList1.add(cb1); + chunkWords = new ArrayList<>(); + + if (splitByX && !isSplitByRuling) { + wasSplitted = true; + cb1.setOrientation(Orientation.LEFT); + splitX1 = word.getMinXDirAdj(); + } else if (newLineAfterSplit && !isSplitByRuling) { + wasSplitted = false; + cb1.setOrientation(Orientation.RIGHT); + splitX1 = null; + } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) { + cb1.setOrientation(Orientation.LEFT); + } + + minX = 1000; + maxX = 0; + minY = 1000; + maxY = 0; + prev = null; + } + + chunkWords.add(word); + + prev = word; + if (word.getMinXDirAdj() < minX) { + minX = word.getMinXDirAdj(); + } + if (word.getMaxXDirAdj() > maxX) { + maxX = word.getMaxXDirAdj(); + } + if (word.getMinYDirAdj() < minY) { + minY = word.getMinYDirAdj(); + } + if (word.getMaxYDirAdj() > maxY) { + maxY = word.getMaxYDirAdj(); + } + } + + TextPageBlock cb1 = buildTextBlock(chunkWords); + if (cb1 != null) { + chunkBlockList1.add(cb1); + } + + return new ClassificationPage(chunkBlockList1); + } + + + private boolean equalsWithThreshold(float f1, float f2) { + + return Math.abs(f1 - f2) < THRESHOLD; + } + + + private TextPageBlock buildTextBlock(List 
wordBlockList) { + + TextPageBlock textBlock = null; + + FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); + StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter(); + StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); + + for (TextPositionSequence wordBlock : wordBlockList) { + + lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); + fontSizeFrequencyCounter.add(wordBlock.getFontSize()); + spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); + fontFrequencyCounter.add(wordBlock.getFont()); + styleFrequencyCounter.add(wordBlock.getFontStyle()); + + if (textBlock == null) { + textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), wordBlock.getMaxXDirAdj(), wordBlock.getMinYDirAdj(), wordBlock.getMaxYDirAdj(), wordBlockList, wordBlock.getRotation()); + } else { + TextPageBlock spatialEntity = textBlock.union(wordBlock); + textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); + } + } + + if (textBlock != null) { + textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); + textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); + } + + if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { + textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); + } + return textBlock; + } + + 
+ private boolean isSplitByRuling(float minX, + float minY, + float maxX, + float maxY, + TextPositionSequence word, + List horizontalRulingLines, + List verticalRulingLines) { + + return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) // + || isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) // + || isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) // + || isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()); // + } + + + private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, float pageWidth, float pageHeight) { + + for (Ruling ruling : rulingLines) { + var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); + if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { + return true; + } + } + return false; + } + + + private double round(float value, int decimalPoints) { + + var d = Math.pow(10, decimalPoints); + return Math.round(value * d) / d; + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/RedactManagerBlockificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/RedactManagerBlockificationService.java new file mode 100644 index 00000000..d303e79e --- /dev/null +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/RedactManagerBlockificationService.java @@ -0,0 +1,280 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service; + +import static java.util.stream.Collectors.toSet; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; + +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.stereotype.Service; + +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.Orientation; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil; + +@Service +@SuppressWarnings("all") +@ConditionalOnProperty(prefix = "application", name = "type", havingValue = "RedactManager") +public class RedactManagerBlockificationService implements BlockificationService{ + + static final float THRESHOLD = 1f; + + + /** + * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions. 
+ * This method must use text direction adjusted positions (DirAdj). Where {0,0} is on the upper left. Never try to change this! + * Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. + * + * @param textPositions The words of a page. + * @param horizontalRulingLines Horizontal table lines. + * @param verticalRulingLines Vertical table lines. + * @return Page object that contains the Textblock and text statistics. + */ + public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + + int indexOnPage = 0; + List chunkWords = new ArrayList<>(); + List chunkBlockList = new ArrayList<>(); + + float minX = 1000, maxX = 0, minY = 1000, maxY = 0; + TextPositionSequence prev = null; + + boolean wasSplitted = false; + Float splitX1 = null; + for (TextPositionSequence word : textPositions) { + + boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25; + boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight(); + boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); + boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX; + boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); + boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); + boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); + + if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { + + Orientation prevOrientation = null; + if (!chunkBlockList.isEmpty()) { + prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation(); + } + + TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); + indexOnPage++; + + 
chunkBlockList.add(cb1); + chunkWords = new ArrayList<>(); + + if (splitByX && !isSplitByRuling) { + wasSplitted = true; + cb1.setOrientation(Orientation.LEFT); + splitX1 = word.getMinXDirAdj(); + } else if (newLineAfterSplit && !isSplitByRuling) { + wasSplitted = false; + cb1.setOrientation(Orientation.RIGHT); + splitX1 = null; + } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) { + cb1.setOrientation(Orientation.LEFT); + } + + minX = 1000; + maxX = 0; + minY = 1000; + maxY = 0; + prev = null; + } + + chunkWords.add(word); + + prev = word; + if (word.getMinXDirAdj() < minX) { + minX = word.getMinXDirAdj(); + } + if (word.getMaxXDirAdj() > maxX) { + maxX = word.getMaxXDirAdj(); + } + if (word.getMinYDirAdj() < minY) { + minY = word.getMinYDirAdj(); + } + if (word.getMaxYDirAdj() > maxY) { + maxY = word.getMaxYDirAdj(); + } + } + + TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); + if (cb1 != null) { + chunkBlockList.add(cb1); + } + + Iterator itty = chunkBlockList.iterator(); + + TextPageBlock previousLeft = null; + TextPageBlock previousRight = null; + while (itty.hasNext()) { + TextPageBlock block = (TextPageBlock) itty.next(); + + if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) { + if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) { + previousLeft.add(block); + itty.remove(); + continue; + } + } + + if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) { + if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) { + previousRight.add(block); + itty.remove(); + continue; + } + } + + if (block.getOrientation().equals(Orientation.LEFT)) { + previousLeft = block; + } else if (block.getOrientation().equals(Orientation.RIGHT)) { + 
previousRight = block; + } + } + + itty = chunkBlockList.iterator(); + TextPageBlock previous = null; + while (itty.hasNext()) { + TextPageBlock block = (TextPageBlock) itty.next(); + + if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(), + previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation() + .equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) { + previous.add(block); + itty.remove(); + continue; + } + + previous = block; + } + + return new ClassificationPage(chunkBlockList); + } + + + private boolean equalsWithThreshold(float f1, float f2) { + + return Math.abs(f1 - f2) < THRESHOLD; + } + + + private TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { + + TextPageBlock textBlock = null; + + FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); + StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter(); + StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); + + for (TextPositionSequence wordBlock : wordBlockList) { + + lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); + fontSizeFrequencyCounter.add(wordBlock.getFontSize()); + spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); + fontFrequencyCounter.add(wordBlock.getFont()); + styleFrequencyCounter.add(wordBlock.getFontStyle()); + + if (textBlock == null) { + textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), + wordBlock.getMaxXDirAdj(), + wordBlock.getMinYDirAdj(), + wordBlock.getMaxYDirAdj(), + wordBlockList, + wordBlock.getRotation()); + } else { + TextPageBlock spatialEntity = textBlock.union(wordBlock); + 
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); + } + } + + if (textBlock != null) { + textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); + textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); + } + + if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { + textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); + } + return textBlock; + } + + + private boolean isSplitByRuling(float minX, + float minY, + float maxX, + float maxY, + TextPositionSequence word, + List horizontalRulingLines, + List verticalRulingLines) { + + return isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()); + } + + + private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, 
float pageWidth, float pageHeight) { + + for (Ruling ruling : rulingLines) { + var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); + if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { + return true; + } + } + return false; + } + + + private double round(float value, int decimalPoints) { + + var d = Math.pow(10, decimalPoints); + return Math.round(value * d) / d; + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java index 8488370d..60ea3603 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/DocumineFloraTest.java @@ -43,7 +43,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest { @Test public void titleExtraction() throws IOException { - AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A13617AV/474_G.1.2 - 1768300_MMNA_A13617AV_report.pdf"); + AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A13617AV/403_F.2 - A13617AV - Acute Inhalation Toxicity - Rats.pdf"); System.out.println("Start Full integration test"); analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId())); System.out.println("Finished structure analysis");