Merge branch 'DM-307-2' into 'master'

DM-307: Enabled to configure custom blockification for DocuMine

Closes DM-307

See merge request redactmanager/redaction-service!20
This commit is contained in:
Dominique Eifländer 2023-06-27 17:09:14 +02:00
commit 0fa9f01273
4 changed files with 481 additions and 256 deletions

View File

@ -19,260 +19,7 @@ import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.mo
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil;
@Service
@SuppressWarnings("all")
public class BlockificationService {
static final float THRESHOLD = 1f;
/**
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
*
* @param textPositions The words of a page.
* @param horizontalRulingLines Horizontal table lines.
* @param verticalRulingLines Vertical table lines.
* @return Page object that contains the Textblock and text statistics.
*/
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
int indexOnPage = 0;
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
boolean wasSplitted = false;
Float splitX1 = null;
for (TextPositionSequence word : textPositions) {
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
Orientation prevOrientation = null;
if (!chunkBlockList.isEmpty()) {
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
}
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
indexOnPage++;
chunkBlockList.add(cb1);
chunkWords = new ArrayList<>();
if (splitByX && !isSplitByRuling) {
wasSplitted = true;
cb1.setOrientation(Orientation.LEFT);
splitX1 = word.getMinXDirAdj();
} else if (newLineAfterSplit && !isSplitByRuling) {
wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT);
splitX1 = null;
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
cb1.setOrientation(Orientation.LEFT);
}
minX = 1000;
maxX = 0;
minY = 1000;
maxY = 0;
prev = null;
}
chunkWords.add(word);
prev = word;
if (word.getMinXDirAdj() < minX) {
minX = word.getMinXDirAdj();
}
if (word.getMaxXDirAdj() > maxX) {
maxX = word.getMaxXDirAdj();
}
if (word.getMinYDirAdj() < minY) {
minY = word.getMinYDirAdj();
}
if (word.getMaxYDirAdj() > maxY) {
maxY = word.getMaxYDirAdj();
}
}
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
if (cb1 != null) {
chunkBlockList.add(cb1);
}
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
TextPageBlock previousLeft = null;
TextPageBlock previousRight = null;
while (itty.hasNext()) {
TextPageBlock block = (TextPageBlock) itty.next();
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
previousLeft.add(block);
itty.remove();
continue;
}
}
if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) {
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
previousRight.add(block);
itty.remove();
continue;
}
}
if (block.getOrientation().equals(Orientation.LEFT)) {
previousLeft = block;
} else if (block.getOrientation().equals(Orientation.RIGHT)) {
previousRight = block;
}
}
itty = chunkBlockList.iterator();
TextPageBlock previous = null;
while (itty.hasNext()) {
TextPageBlock block = (TextPageBlock) itty.next();
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
previous.add(block);
itty.remove();
continue;
}
previous = block;
}
return new ClassificationPage(chunkBlockList);
}
private boolean equalsWithThreshold(float f1, float f2) {
return Math.abs(f1 - f2) < THRESHOLD;
}
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
TextPageBlock textBlock = null;
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
for (TextPositionSequence wordBlock : wordBlockList) {
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
fontFrequencyCounter.add(wordBlock.getFont());
styleFrequencyCounter.add(wordBlock.getFontStyle());
if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
} else {
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
}
}
if (textBlock != null) {
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
}
private boolean isSplitByRuling(float minX,
float minY,
float maxX,
float maxY,
TextPositionSequence word,
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
return isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight());
}
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
for (Ruling ruling : rulingLines) {
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
return true;
}
}
return false;
}
private double round(float value, int decimalPoints) {
var d = Math.pow(10, decimalPoints);
return Math.round(value * d) / d;
}
public interface BlockificationService {
ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines);
}

View File

@ -0,0 +1,198 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import static java.util.stream.Collectors.toSet;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.Orientation;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil;
@Service
@ConditionalOnProperty(prefix = "application", name = "type", havingValue = "DocuMine")
@SuppressWarnings("all")
public class DocuMineBlockificationService implements BlockificationService{
static final float THRESHOLD = 1f;
/**
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
* @param textPositions The words of a page.
* @param horizontalRulingLines Horizontal table lines.
* @param verticalRulingLines Vertical table lines.
* @return Page object that contains the Textblock and text statistics.
*/
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
boolean wasSplitted = false;
Float splitX1 = null;
for (TextPositionSequence word : textPositions) {
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY)) {
Orientation prevOrientation = null;
if (!chunkBlockList1.isEmpty()) {
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
}
TextPageBlock cb1 = buildTextBlock(chunkWords);
chunkBlockList1.add(cb1);
chunkWords = new ArrayList<>();
if (splitByX && !isSplitByRuling) {
wasSplitted = true;
cb1.setOrientation(Orientation.LEFT);
splitX1 = word.getMinXDirAdj();
} else if (newLineAfterSplit && !isSplitByRuling) {
wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT);
splitX1 = null;
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
cb1.setOrientation(Orientation.LEFT);
}
minX = 1000;
maxX = 0;
minY = 1000;
maxY = 0;
prev = null;
}
chunkWords.add(word);
prev = word;
if (word.getMinXDirAdj() < minX) {
minX = word.getMinXDirAdj();
}
if (word.getMaxXDirAdj() > maxX) {
maxX = word.getMaxXDirAdj();
}
if (word.getMinYDirAdj() < minY) {
minY = word.getMinYDirAdj();
}
if (word.getMaxYDirAdj() > maxY) {
maxY = word.getMaxYDirAdj();
}
}
TextPageBlock cb1 = buildTextBlock(chunkWords);
if (cb1 != null) {
chunkBlockList1.add(cb1);
}
return new ClassificationPage(chunkBlockList1);
}
private boolean equalsWithThreshold(float f1, float f2) {
return Math.abs(f1 - f2) < THRESHOLD;
}
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {
TextPageBlock textBlock = null;
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
for (TextPositionSequence wordBlock : wordBlockList) {
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
fontFrequencyCounter.add(wordBlock.getFont());
styleFrequencyCounter.add(wordBlock.getFontStyle());
if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), wordBlock.getMaxXDirAdj(), wordBlock.getMinYDirAdj(), wordBlock.getMaxYDirAdj(), wordBlockList, wordBlock.getRotation());
} else {
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
}
}
if (textBlock != null) {
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
}
private boolean isSplitByRuling(float minX,
float minY,
float maxX,
float maxY,
TextPositionSequence word,
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|| isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|| isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) //
|| isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()); //
}
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
for (Ruling ruling : rulingLines) {
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
return true;
}
}
return false;
}
private double round(float value, int decimalPoints) {
var d = Math.pow(10, decimalPoints);
return Math.round(value * d) / d;
}
}

View File

@ -0,0 +1,280 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import static java.util.stream.Collectors.toSet;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.Orientation;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil;
@Service
@SuppressWarnings("all")
@ConditionalOnProperty(prefix = "application", name = "type", havingValue = "RedactManager")
public class RedactManagerBlockificationService implements BlockificationService{
static final float THRESHOLD = 1f;
/**
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
*
* @param textPositions The words of a page.
* @param horizontalRulingLines Horizontal table lines.
* @param verticalRulingLines Vertical table lines.
* @return Page object that contains the Textblock and text statistics.
*/
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
int indexOnPage = 0;
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
boolean wasSplitted = false;
Float splitX1 = null;
for (TextPositionSequence word : textPositions) {
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
Orientation prevOrientation = null;
if (!chunkBlockList.isEmpty()) {
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
}
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
indexOnPage++;
chunkBlockList.add(cb1);
chunkWords = new ArrayList<>();
if (splitByX && !isSplitByRuling) {
wasSplitted = true;
cb1.setOrientation(Orientation.LEFT);
splitX1 = word.getMinXDirAdj();
} else if (newLineAfterSplit && !isSplitByRuling) {
wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT);
splitX1 = null;
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
cb1.setOrientation(Orientation.LEFT);
}
minX = 1000;
maxX = 0;
minY = 1000;
maxY = 0;
prev = null;
}
chunkWords.add(word);
prev = word;
if (word.getMinXDirAdj() < minX) {
minX = word.getMinXDirAdj();
}
if (word.getMaxXDirAdj() > maxX) {
maxX = word.getMaxXDirAdj();
}
if (word.getMinYDirAdj() < minY) {
minY = word.getMinYDirAdj();
}
if (word.getMaxYDirAdj() > maxY) {
maxY = word.getMaxYDirAdj();
}
}
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
if (cb1 != null) {
chunkBlockList.add(cb1);
}
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
TextPageBlock previousLeft = null;
TextPageBlock previousRight = null;
while (itty.hasNext()) {
TextPageBlock block = (TextPageBlock) itty.next();
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
previousLeft.add(block);
itty.remove();
continue;
}
}
if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) {
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
previousRight.add(block);
itty.remove();
continue;
}
}
if (block.getOrientation().equals(Orientation.LEFT)) {
previousLeft = block;
} else if (block.getOrientation().equals(Orientation.RIGHT)) {
previousRight = block;
}
}
itty = chunkBlockList.iterator();
TextPageBlock previous = null;
while (itty.hasNext()) {
TextPageBlock block = (TextPageBlock) itty.next();
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
previous.add(block);
itty.remove();
continue;
}
previous = block;
}
return new ClassificationPage(chunkBlockList);
}
private boolean equalsWithThreshold(float f1, float f2) {
return Math.abs(f1 - f2) < THRESHOLD;
}
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
TextPageBlock textBlock = null;
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
for (TextPositionSequence wordBlock : wordBlockList) {
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
fontFrequencyCounter.add(wordBlock.getFont());
styleFrequencyCounter.add(wordBlock.getFontStyle());
if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
} else {
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
}
}
if (textBlock != null) {
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
}
private boolean isSplitByRuling(float minX,
float minY,
float maxX,
float maxY,
TextPositionSequence word,
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
return isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight());
}
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
for (Ruling ruling : rulingLines) {
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
return true;
}
}
return false;
}
private double round(float value, int decimalPoints) {
var d = Math.pow(10, decimalPoints);
return Math.round(value * d) / d;
}
}

View File

@ -43,7 +43,7 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
@Test
public void titleExtraction() throws IOException {
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A13617AV/474_G.1.2 - 1768300_MMNA_A13617AV_report.pdf");
AnalyzeRequest request = uploadFileToStorage("files/Documine/Flora/A13617AV/403_F.2 - A13617AV - Acute Inhalation Toxicity - Rats.pdf");
System.out.println("Start Full integration test");
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
System.out.println("Finished structure analysis");