RED-5381: Fixed calculation of textblocks and body text frame for rotated text and rotated pages

This commit is contained in:
deiflaender 2022-10-13 09:10:15 +02:00
parent ddbf80e4a6
commit aa43453206
53 changed files with 687 additions and 5653 deletions

View File

@ -1,16 +1,18 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import java.util.ArrayList;
import java.util.List;
@Data
@RequiredArgsConstructor
public class Page {
@ -32,7 +34,8 @@ public class Page {
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
private double cropBoxArea;
private float pageWidth;
private float pageHeight;
public boolean isRotated() {

View File

@ -1,8 +1,12 @@
package com.iqser.red.service.redaction.v1.server.classification.model;
import java.util.ArrayList;
import java.util.List;
import com.dslplatform.json.CompiledJson;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextDirection;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
@ -12,9 +16,6 @@ import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.ArrayList;
import java.util.List;
@AllArgsConstructor
@Builder
@Data
@ -50,6 +51,139 @@ public class TextBlock extends AbstractTextContainer {
private String classification;
@JsonIgnore
@JsonAttribute(ignore = true)
public TextDirection getDir() {
return sequences.get(0).getDir();
}
@JsonIgnore
@JsonAttribute(ignore = true)
private float getPageHeight() {
return sequences.get(0).getPageHeight();
}
@JsonIgnore
@JsonAttribute(ignore = true)
private float getPageWidth() {
return sequences.get(0).getPageWidth();
}
/**
* Returns the minX value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
* 0 -> LowerLeft
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
*
* @return the minX value in pdf coordinate system
*/
@JsonIgnore
@JsonAttribute(ignore = true)
public float getPdfMinX() {
if (getDir().getDegrees() == 90) {
return minY;
} else if (getDir().getDegrees() == 180) {
return getPageWidth() - maxX;
} else if (getDir().getDegrees() == 270) {
return getPageWidth() - maxY;
} else {
return minX;
}
}
/**
* Returns the maxX value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
* 0 -> LowerLeft
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
*
* @return the maxX value in pdf coordinate system
*/
@JsonIgnore
@JsonAttribute(ignore = true)
public float getPdfMaxX() {
if (getDir().getDegrees() == 90) {
return maxY;
} else if (getDir().getDegrees() == 180) {
return getPageWidth() - minX;
} else if (getDir().getDegrees() == 270) {
return getPageWidth() - minY;
} else {
return maxX;
}
}
/**
* Returns the minY value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
* 0 -> LowerLeft
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
*
* @return the minY value in pdf coordinate system
*/
@JsonIgnore
@JsonAttribute(ignore = true)
public float getPdfMinY() {
if (getDir().getDegrees() == 90) {
return minX;
} else if (getDir().getDegrees() == 180) {
return maxY;
} else if (getDir().getDegrees() == 270) {
return getPageHeight() - maxX;
} else {
return getPageHeight() - maxY;
}
}
/**
* Returns the maxY value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
* 0 -> LowerLeft
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
*
* @return the maxY value in pdf coordinate system
*/
@JsonIgnore
@JsonAttribute(ignore = true)
public float getPdfMaxY() {
if (getDir().getDegrees() == 90) {
return maxX;
} else if (getDir().getDegrees() == 180) {
return minY;
} else if (getDir().getDegrees() == 270) {
return getPageHeight() - minX;
} else {
return getPageHeight() - minY;
}
}
public TextBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
this.minX = minX;
@ -97,17 +231,17 @@ public class TextBlock extends AbstractTextContainer {
public void add(TextPositionSequence r) {
if (r.getX1() < minX) {
minX = r.getX1();
if (r.getMinXDirAdj() < minX) {
minX = r.getMinXDirAdj();
}
if (r.getX2() > maxX) {
maxX = r.getX2();
if (r.getMaxXDirAdj() > maxX) {
maxX = r.getMaxXDirAdj();
}
if (r.getY1() < minY) {
minY = r.getY1();
if (r.getMinYDirAdj() < minY) {
minY = r.getMinYDirAdj();
}
if (r.getY2() > maxY) {
maxY = r.getY2();
if (r.getMaxYDirAdj() > maxY) {
maxY = r.getMaxYDirAdj();
}
}
@ -162,7 +296,7 @@ public class TextBlock extends AbstractTextContainer {
TextPositionSequence previous = null;
for (TextPositionSequence word : sequences) {
if (previous != null) {
if (Math.abs(previous.getRotationAdjustedY() - word.getRotationAdjustedY()) > word.getTextHeight()) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');

View File

@ -14,13 +14,10 @@ import com.iqser.red.service.redaction.v1.server.classification.model.Orientatio
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.StringFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
import com.iqser.red.service.redaction.v1.server.classification.utils.RulingTextDirAdjustUtil;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
@Service
@SuppressWarnings("all")
@ -29,10 +26,17 @@ public class BlockificationService {
static final float THRESHOLD = 1f;
/**
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
* @param textPositions The words of a page.
* @param horizontalRulingLines Horizontal table lines.
* @param verticalRulingLines Vertical table lines.
* @return Page object that contains the Textblock and text statistics.
*/
public Page blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
sortRotatedSequences(textPositions);
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractTextContainer> chunkBlockList1 = new ArrayList<>();
@ -43,23 +47,15 @@ public class BlockificationService {
Float splitX1 = null;
for (TextPositionSequence word : textPositions) {
boolean lineSeparation = minY - word.getY2() > word.getHeight() * 1.25;
boolean startFromTop = word.getY1() > maxY + word.getHeight();
boolean splitByX = prev != null && maxX + 50 < word.getX1() && prev.getY1() == word.getY1();
boolean newLineAfterSplit = prev != null && word.getY1() != prev.getY1() && wasSplitted && splitX1 != word.getX1();
boolean splittedByRuling = isSplittedByRuling(maxX, minY, word.getX1(), word.getY1(), verticalRulingLines) || isSplittedByRuling(minX,
minY,
word.getX1(),
word.getY2(),
horizontalRulingLines)
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSpitByRuling = isSpitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|| isSplittedByRuling(maxX, minY, word.getX1(), word.getY1(), horizontalRulingLines) || isSplittedByRuling(minX,
minY,
word.getX1(),
word.getY2(),
verticalRulingLines);
if (prev != null && (lineSeparation || startFromTop || splitByX || newLineAfterSplit || splittedByRuling)) {
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSpitByRuling)) {
Orientation prevOrientation = null;
if (!chunkBlockList1.isEmpty()) {
@ -70,15 +66,15 @@ public class BlockificationService {
chunkBlockList1.add(cb1);
chunkWords = new ArrayList<>();
if (splitByX && !splittedByRuling) {
if (splitByX && !isSpitByRuling) {
wasSplitted = true;
cb1.setOrientation(Orientation.LEFT);
splitX1 = word.getX1();
} else if (newLineAfterSplit && !splittedByRuling) {
splitX1 = word.getMinXDirAdj();
} else if (newLineAfterSplit && !isSpitByRuling) {
wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT);
splitX1 = null;
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !splittedByRuling)) {
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSpitByRuling)) {
cb1.setOrientation(Orientation.LEFT);
}
@ -92,17 +88,17 @@ public class BlockificationService {
chunkWords.add(word);
prev = word;
if (word.getX1() < minX) {
minX = word.getX1();
if (word.getMinXDirAdj() < minX) {
minX = word.getMinXDirAdj();
}
if (word.getX2() > maxX) {
maxX = word.getX2();
if (word.getMaxXDirAdj() > maxX) {
maxX = word.getMaxXDirAdj();
}
if (word.getY1() < minY) {
minY = word.getY1();
if (word.getMinYDirAdj() < minY) {
minY = word.getMinYDirAdj();
}
if (word.getY2() > maxY) {
maxY = word.getY2();
if (word.getMaxYDirAdj() > maxY) {
maxY = word.getMaxYDirAdj();
}
}
@ -186,7 +182,7 @@ public class BlockificationService {
styleFrequencyCounter.add(wordBlock.getFontStyle());
if (textBlock == null) {
textBlock = new TextBlock(wordBlock.getX1(), wordBlock.getX2(), wordBlock.getY1(), wordBlock.getY2(), wordBlockList, wordBlock.getRotation());
textBlock = new TextBlock(wordBlock.getMinXDirAdj(), wordBlock.getMaxXDirAdj(), wordBlock.getMinYDirAdj(), wordBlock.getMaxYDirAdj(), wordBlockList, wordBlock.getRotation());
} else {
TextBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
@ -202,17 +198,58 @@ public class BlockificationService {
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getY1(), 3)).collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getX1));
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
}
private boolean isSplittedByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines) {
private boolean isSpitByRuling(float minX,
float minY,
float maxX,
float maxY,
TextPositionSequence word,
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
return isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) || isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) || isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) || isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight());
}
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
for (Ruling ruling : rulingLines) {
if (ruling.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
return true;
}
}
@ -220,103 +257,6 @@ public class BlockificationService {
}
public Rectangle calculateBodyTextFrame(List<Page> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
float minX = 10000;
float maxX = -100;
float minY = 10000;
float maxY = -100;
for (Page page : pages) {
if (page.getTextBlocks().isEmpty() || landscape != page.isLandscape()) {
continue;
}
for (AbstractTextContainer container : page.getTextBlocks()) {
if (container instanceof TextBlock) {
TextBlock textBlock = (TextBlock) container;
if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) {
continue;
}
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
if (approxLineCount < 2.9f) {
continue;
}
if (documentFontSizeCounter.getMostPopular() != null) {
if (textBlock.getMostPopularWordFontSize() >= documentFontSizeCounter.getMostPopular()) {
if (textBlock.getMinX() < minX) {
minX = textBlock.getMinX();
}
if (textBlock.getMaxX() > maxX) {
maxX = textBlock.getMaxX();
}
if (textBlock.getMinY() < minY) {
minY = textBlock.getMinY();
}
if (textBlock.getMaxY() > maxY) {
maxY = textBlock.getMaxY();
}
}
}
}
if (container instanceof Table) {
Table table = (Table) container;
for (List<Cell> row : table.getRows()) {
for (Cell cell : row) {
if (cell == null || cell.getTextBlocks() == null) {
continue;
}
for (TextBlock textBlock : cell.getTextBlocks()) {
if (textBlock.getMinX() < minX) {
minX = textBlock.getMinX();
}
if (textBlock.getMaxX() > maxX) {
maxX = textBlock.getMaxX();
}
if (textBlock.getMinY() < minY) {
minY = textBlock.getMinY();
}
if (textBlock.getMaxY() > maxY) {
maxY = textBlock.getMaxY();
}
}
}
}
}
}
}
return new Rectangle(minY, minX, maxX - minX, maxY - minY);
}
private void sortRotatedSequences(List<TextPositionSequence> sequences) {
List<TextPositionSequence> rotatedWords = new ArrayList<>();
Iterator<TextPositionSequence> itty = sequences.iterator();
while (itty.hasNext()) {
var pos = itty.next();
if (pos.getTextPositions().get(0).getDir() == 270) {
rotatedWords.add(pos);
itty.remove();
}
}
if (!rotatedWords.isEmpty() && !sequences.isEmpty()) {
rotatedWords.sort(Comparator.comparing(TextPositionSequence::getX1));
}
sequences.addAll(rotatedWords);
}
private double round(float value, int decimalPoints) {
var d = Math.pow(10, decimalPoints);

View File

@ -0,0 +1,171 @@
package com.iqser.red.service.redaction.v1.server.classification.service;
import java.util.List;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
@Service
public class BodyTextFrameService {
/**
* Adjusts and sets the body text frame to a page.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
* 0 -> LowerLeft
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
* The aspect ratio of the page is also regarded.
*
* @param page The page
* @param bodyTextFrame frame that contains the main text on portrait pages
* @param landscapeBodyTextFrame frame that contains the main text on landscape pages
*/
public void setBodyTextFrameAdjustedToPage(Page page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
Rectangle textFrame = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() == 270) {
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), page.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()),
textFrame.getHeight(),
textFrame.getWidth(),
0);
} else if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), page.getPageNumber());
} else if (page.getRotation() == 180) {
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), page.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()),
textFrame.getWidth(),
textFrame.getHeight(),
0);
}
page.setBodyTextFrame(textFrame);
}
/**
* Calculates the frame that contains the main text, text outside the frame will be e.g. headers or footers.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
* 0 -> LowerLeft
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
* The aspect ratio of the page is also regarded.
*
* @param pages List of all pages
* @param documentFontSizeCounter Statistics of the document
* @param landscape Calculate for landscape or portrait
* @return Rectangle of the text frame
*/
public Rectangle calculateBodyTextFrame(List<Page> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
float minX = 10000;
float maxX = -100;
float minY = 10000;
float maxY = -100;
for (Page page : pages) {
if (page.getTextBlocks().isEmpty() || landscape != page.isLandscape()) {
continue;
}
for (AbstractTextContainer container : page.getTextBlocks()) {
if (container instanceof TextBlock) {
TextBlock textBlock = (TextBlock) container;
if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) {
continue;
}
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
if (approxLineCount < 2.9f) {
continue;
}
if (documentFontSizeCounter.getMostPopular() != null && textBlock.getMostPopularWordFontSize() >= documentFontSizeCounter.getMostPopular()) {
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
if (textBlock.getPdfMinY() < minX) {
minX = textBlock.getPdfMinY();
}
if (textBlock.getPdfMaxY() > maxX) {
maxX = textBlock.getPdfMaxY();
}
if (textBlock.getPdfMinX() < minY) {
minY = textBlock.getPdfMinX();
}
if (textBlock.getPdfMaxX() > maxY) {
maxY = textBlock.getPdfMaxX();
}
} else {
if (textBlock.getPdfMinX() < minX) {
minX = textBlock.getPdfMinX();
}
if (textBlock.getPdfMaxX() > maxX) {
maxX = textBlock.getPdfMaxX();
}
if (textBlock.getPdfMinY() < minY) {
minY = textBlock.getPdfMinY();
}
if (textBlock.getPdfMaxY() > maxY) {
maxY = textBlock.getPdfMaxY();
}
}
}
}
if (container instanceof Table) {
Table table = (Table) container;
for (List<Cell> row : table.getRows()) {
for (Cell cell : row) {
if (cell == null || cell.getTextBlocks() == null) {
continue;
}
for (TextBlock textBlock : cell.getTextBlocks()) {
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
if (textBlock.getPdfMinY() < minX) {
minX = textBlock.getMinY();
}
if (textBlock.getPdfMaxY() > maxX) {
maxX = textBlock.getPdfMaxY();
}
if (textBlock.getPdfMinX() < minY) {
minY = textBlock.getPdfMinX();
}
if (textBlock.getPdfMaxX() > maxY) {
maxY = textBlock.getPdfMaxX();
}
} else {
if (textBlock.getPdfMinX() < minX) {
minX = textBlock.getPdfMinX();
}
if (textBlock.getPdfMaxX() > maxX) {
maxX = textBlock.getPdfMaxX();
}
if (textBlock.getPdfMinY() < minY) {
minY = textBlock.getPdfMinY();
}
if (textBlock.getPdfMaxY() > maxY) {
maxY = textBlock.getPdfMaxY();
}
}
}
}
}
}
}
}
return new Rectangle(new Point(minX, minY), maxX - minX, maxY - minY, 0);
}
}

View File

@ -1,76 +1,75 @@
package com.iqser.red.service.redaction.v1.server.classification.service;
import java.util.List;
import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import java.util.List;
import java.util.regex.Pattern;
@Slf4j
@Service
@RequiredArgsConstructor
public class ClassificationService {
private final BlockificationService blockificationService;
private final BodyTextFrameService bodyTextFrameService;
public void classifyDocument(Document document) {
Rectangle bodyTextFrame = blockificationService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
Rectangle landscapeBodyTextFrame = blockificationService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
for (Page page : document.getPages()) {
Rectangle btf = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
page.setBodyTextFrame(btf);
classifyPage(btf, page, document, headlineFontSizes);
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
classifyPage(page, document, headlineFontSizes);
}
}
public void classifyPage(Rectangle bodyTextFrame, Page page, Document document, List<Float> headlineFontSizes) {
public void classifyPage(Page page, Document document, List<Float> headlineFontSizes) {
for (AbstractTextContainer textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextBlock) {
classifyBlock((TextBlock) textBlock, bodyTextFrame, page, document, headlineFontSizes);
classifyBlock((TextBlock) textBlock, page, document, headlineFontSizes);
}
}
}
public void classifyBlock(TextBlock textBlock, Rectangle bodyTextFrame, Page page, Document document, List<Float> headlineFontSizes) {
public void classifyBlock(TextBlock textBlock, Page page, Document document, List<Float> headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame();
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification("Other");
return;
}
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.isRotated()) && (document.getFontSizeCounter()
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification("Header");
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock) && (document.getFontSizeCounter()
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification("Footer");
} else if (page.getPageNumber() == 1 && (!PositionUtils.isTouchingUnderBodyTextFrame(bodyTextFrame,
textBlock) && PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification("Title");
}
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
.getCountPerValue()
.containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()

View File

@ -1,7 +1,7 @@
package com.iqser.red.service.redaction.v1.server.classification.utils;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import lombok.experimental.UtilityClass;
@ -9,17 +9,19 @@ import lombok.experimental.UtilityClass;
@SuppressWarnings("all")
public class PositionUtils {
// TODO This currently uses pdf coord system. In the futher this should use java coord system.
// Note: DirAdj (TextDirection Adjusted) can not be user for this.
public boolean isWithinBodyTextFrame(Rectangle btf, TextBlock textBlock) {
//TODO Currently this is not working for rotated pages.
if (btf == null || textBlock == null) {
return false;
}
double threshold = textBlock.getMostPopularWordHeight() * 3;
if (textBlock.getMinX() + threshold > btf.getX() && textBlock.getMaxX() - threshold < btf.getX() + btf.getWidth() && textBlock.getMinY() + threshold > btf.getY() && textBlock.getMaxY() - threshold < btf.getY() + btf.getHeight()) {
if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX() && textBlock.getPdfMaxX() - threshold < btf.getTopLeft()
.getX() + btf.getWidth() && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY() && textBlock.getPdfMaxY() - threshold < btf.getTopLeft()
.getY() + btf.getHeight()) {
return true;
} else {
return false;
@ -28,16 +30,27 @@ public class PositionUtils {
}
public boolean isOverBodyTextFrame(Rectangle btf, TextBlock textBlock, boolean rotated) {
// TODO This currently uses pdf coord system. In the futher this should use java coord system.
// Note: DirAdj (TextDirection Adjusted) can not be user for this.
public boolean isOverBodyTextFrame(Rectangle btf, TextBlock textBlock, int rotation) {
if (btf == null || textBlock == null) {
return false;
}
if (rotated && textBlock.getMinX() < btf.getX()) {
// Its very strange, P{0,0} is on top left in this case, instead of lower left.
if (rotation == 90 && textBlock.getPdfMaxX() < btf.getTopLeft().getX()) {
return true;
} else if (!rotated && textBlock.getMinY() > btf.getY() + btf.getHeight()) {
}
if (rotation == 180 && textBlock.getPdfMaxY() < btf.getTopLeft().getY()) {
return true;
}
if (rotation == 270 && textBlock.getPdfMinX() > btf.getTopLeft().getX() + btf.getWidth()) {
return true;
}
if (rotation == 0 && textBlock.getPdfMinY() > btf.getTopLeft().getY() + btf.getHeight()) {
return true;
} else {
return false;
@ -45,16 +58,27 @@ public class PositionUtils {
}
public boolean isUnderBodyTextFrame(Rectangle btf, TextBlock textBlock) {
//TODO Currently this is not working for rotated pages.
// TODO This currently uses pdf coord system. In the futher this should use java coord system.
// Note: DirAdj (TextDirection Adjusted) can not be user for this.
public boolean isUnderBodyTextFrame(Rectangle btf, TextBlock textBlock, int rotation) {
if (btf == null || textBlock == null) {
return false;
}
if (textBlock.getMaxY() < btf.getY()) {
if (rotation == 90 && textBlock.getPdfMinX() > btf.getTopLeft().getX() + btf.getWidth()) {
return true;
}
if (rotation == 180 && textBlock.getPdfMinY() > btf.getTopLeft().getY() + btf.getHeight()) {
return true;
}
if (rotation == 270 && textBlock.getPdfMaxX() < btf.getTopLeft().getX()) {
return true;
}
if (rotation == 0 && textBlock.getPdfMaxY() < btf.getTopLeft().getY()) {
return true;
} else {
return false;
@ -62,7 +86,8 @@ public class PositionUtils {
}
// TODO This currently uses pdf coord system. In the futher this should use java coord system.
// Note: DirAdj (TextDirection Adjusted) can not be user for this.
public boolean isTouchingUnderBodyTextFrame(Rectangle btf, TextBlock textBlock) {
//TODO Currently this is not working for rotated pages.
@ -71,7 +96,7 @@ public class PositionUtils {
return false;
}
if (textBlock.getMinY() < btf.getY()) {
if (textBlock.getMinY() < btf.getTopLeft().getY()) {
return true;
} else {
return false;

View File

@ -0,0 +1,67 @@
package com.iqser.red.service.redaction.v1.server.classification.utils;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import lombok.experimental.UtilityClass;
@UtilityClass
public class RulingTextDirAdjustUtil {
/**
* Converts a ruling (line of a table) the same way TextPositions are converted in PDFBox.
* This will get the y position of the text, adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
*
* See org.apache.pdfbox.text.TextPosition
*/
public Line2D.Float convertToDirAdj(Ruling ruling, float dir, float pageWidth, float pageHeight) {
return new Line2D.Float(convertPoint(ruling.x1, ruling.y1, dir, pageWidth, pageHeight), convertPoint(ruling.x2, ruling.y2, dir, pageWidth, pageHeight));
}
private Point2D convertPoint(float x, float y, float dir, float pageWidth, float pageHeight) {
var xAdj = getXRot(x, y, dir, pageWidth, pageHeight);
var yAdj = 0f;
if (dir == 0 || dir == 180) {
yAdj = pageHeight - getYLowerLeftRot(x, y, dir, pageWidth, pageHeight);
} else {
yAdj = pageWidth - getYLowerLeftRot(x, y, dir, pageWidth, pageHeight);
}
return new Point2D.Float(xAdj, yAdj);
}
private float getXRot(float x, float y, float dir, float pageWidth, float pageHeight) {
if (dir == 0) {
return x;
} else if (dir == 90) {
return y;
} else if (dir == 180) {
return pageWidth - x;
} else if (dir == 270) {
return pageHeight - y;
}
return 0;
}
private float getYLowerLeftRot(float x, float y, float dir, float pageWidth, float pageHeight) {
if (dir == 0) {
return y;
} else if (dir == 90) {
return pageWidth - x;
} else if (dir == 180) {
return pageHeight - y;
} else if (dir == 270) {
return x;
}
return 0;
}
}

View File

@ -47,16 +47,6 @@ public class TextPositionSequence implements CharSequence {
}
public TextPositionSequence fromData(List<RedTextPosition> textPositions, int page) {
var textPositionSequence = new TextPositionSequence();
textPositionSequence.textPositions = textPositions;
textPositionSequence.page = page;
return textPositionSequence;
}
public TextPositionSequence(List<TextPosition> textPositions, int page) {
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
@ -147,59 +137,63 @@ public class TextPositionSequence implements CharSequence {
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted minX value
*/
@JsonIgnore
@JsonAttribute(ignore = true)
public float getX1() {
public float getMinXDirAdj() {
return textPositions.get(0).getXDirAdj();
if (rotation == 90) {
return textPositions.get(0).getYDirAdj() - getTextHeight();
} else {
return textPositions.get(0).getXDirAdj();
}
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted maxX value
*/
@JsonIgnore
@JsonAttribute(ignore = true)
public float getX2() {
public float getMaxXDirAdj() {
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
if (rotation == 90) {
return textPositions.get(0).getYDirAdj();
} else {
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
}
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted minY value. The upper border of the bounding box of the word.
*/
@JsonIgnore
@JsonAttribute(ignore = true)
public float getRotationAdjustedY() {
public float getMinYDirAdj() {
return textPositions.get(0).getYDirAdj() - getTextHeight();
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
*/
@JsonIgnore
@JsonAttribute(ignore = true)
public float getMaxYDirAdj() {
return textPositions.get(0).getYDirAdj();
}
@JsonIgnore
@JsonAttribute(ignore = true)
public float getY1() {
if (rotation == 90) {
return textPositions.get(0).getXDirAdj();
} else {
return pageHeight - textPositions.get(0).getYDirAdj();
}
}
@JsonIgnore
@JsonAttribute(ignore = true)
public float getY2() {
if (rotation == 90) {
return textPositions.get(textPositions.size() - 1).getXDirAdj() + getTextHeight() - HEIGHT_PADDING;
} else {
return pageHeight - textPositions.get(0).getYDirAdj() + getTextHeight();
}
}
@ -215,7 +209,7 @@ public class TextPositionSequence implements CharSequence {
@JsonAttribute(ignore = true)
public float getHeight() {
return getY2() - getY1();
return getMaxYDirAdj() - getMinYDirAdj();
}
@ -223,7 +217,7 @@ public class TextPositionSequence implements CharSequence {
@JsonAttribute(ignore = true)
public float getWidth() {
return getX2() - getX1();
return getMaxXDirAdj() - getMinXDirAdj();
}
@ -270,6 +264,15 @@ public class TextPositionSequence implements CharSequence {
}
/**
* This returns the bounding box of the word in Pdf Coordinate System where {0,0} rotated with the page rotation.
* 0 -> LowerLeft
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
*
* @return bounding box of the word in Pdf Coordinate System
*/
@JsonIgnore
@JsonAttribute(ignore = true)
@SneakyThrows

View File

@ -33,7 +33,7 @@ public class CellValue {
TextPositionSequence previous = null;
for (TextPositionSequence word : textBlock.getSequences()) {
if (previous != null) {
if (Math.abs(previous.getRotationAdjustedY() - word.getRotationAdjustedY()) > word.getTextHeight()) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');

View File

@ -194,7 +194,7 @@ public class SearchableText {
for (TextPositionSequence word : sequences) {
if (previous != null) {
if (Math.abs(previous.getRotationAdjustedY() - word.getRotationAdjustedY()) > word.getTextHeight()) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');
@ -228,7 +228,7 @@ public class SearchableText {
for (TextPositionSequence word : sorted) {
if (previous != null) {
if (Math.abs(previous.getRotationAdjustedY() - word.getRotationAdjustedY()) > word.getTextHeight()) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');
@ -249,7 +249,7 @@ public class SearchableText {
for (TextPositionSequence word : sequences) {
if (previous != null) {
if (Math.abs(previous.getRotationAdjustedY() - word.getRotationAdjustedY()) > word.getTextHeight()) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');

View File

@ -43,12 +43,13 @@ public class SectionGridCreatorService {
if (textBlock instanceof TextBlock) {
TextBlock tb = (TextBlock) textBlock;
classifiedDoc.getSectionGrid()
.getRectanglesPerPage()
.computeIfAbsent(page, (x) -> new ArrayList<>())
.add(new SectionRectangle(new Point(textBlock.getMinX(), textBlock.getMinY()),
textBlock.getWidth(),
textBlock.getHeight(),
.add(new SectionRectangle(new Point(tb.getPdfMinX(), tb.getPdfMinY()),
tb.getPdfMaxX() - tb.getPdfMinX(),
tb.getPdfMaxY() - tb.getPdfMinY(),
i + 1,
paragraph.getPageBlocks().size(),
null));

View File

@ -292,9 +292,9 @@ public class EntitySearchUtils {
.get(0)
.getSequences()
.get(0)
.getX1() && image.getPosition().getX() + image.getPosition().getWidth() > entity.getPositionSequences().get(0).getSequences().get(0).getX2() && image.getPosition()
.getY() < entity.getPositionSequences().get(0).getSequences().get(0).getY1() && image.getPosition().getY() + image.getPosition()
.getHeight() > entity.getPositionSequences().get(0).getSequences().get(0).getY2();
.getMinXDirAdj() && image.getPosition().getX() + image.getPosition().getWidth() > entity.getPositionSequences().get(0).getSequences().get(0).getMaxXDirAdj() && image.getPosition()
.getY() < entity.getPositionSequences().get(0).getSequences().get(0).getMinYDirAdj() && image.getPosition().getY() + image.getPosition()
.getHeight() > entity.getPositionSequences().get(0).getSequences().get(0).getMaxYDirAdj();
}

View File

@ -99,24 +99,22 @@ public class PdfSegmentationService {
stripper.getText(pdDocument);
PDRectangle pdr = pdPage.getMediaBox();
boolean isLandscape = pdr.getWidth() > pdr.getHeight();
int rotation = pdPage.getRotation();
boolean isRotated = rotation != 0 && rotation != 360;
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
stripper.getRulings(),
stripper.getMinCharWidth(),
stripper.getMaxCharHeight());
Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
PDRectangle cropbox = pdPage.getCropBox();
float cropboxArea = cropbox.getHeight() * cropbox.getWidth();
page.setCropBoxArea(cropboxArea);
page.setRotation(rotation);
page.setLandscape(isLandscape || isRotated);
page.setLandscape(isLandscape);
page.setPageNumber(pageNumber);
page.setPageWidth(cropbox.getWidth());
page.setPageHeight(cropbox.getHeight());
tableExtractionService.extractTables(cleanRulings, page);
buildPageStatistics(page);

View File

@ -4,6 +4,7 @@ import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.classification.model.Orientation;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -34,6 +35,12 @@ public abstract class AbstractTextContainer {
public abstract String getText();
public boolean containsBlock(TextBlock other) {
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
}
public boolean contains(AbstractTextContainer other) {
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;

View File

@ -51,7 +51,7 @@ public class Cell extends Rectangle {
for (TextPositionSequence word : textBlock.getSequences()) {
if (previous != null) {
if (Math.abs(previous.getRotationAdjustedY() - word.getRotationAdjustedY()) > word.getTextHeight()) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');

View File

@ -1,15 +1,27 @@
package com.iqser.red.service.redaction.v1.server.tableextraction.service;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.*;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import java.awt.geom.Point2D;
import java.util.*;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Rectangle;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils;
@Service
public class TableExtractionService {
@ -54,6 +66,19 @@ public class TableExtractionService {
};
/**
* Finds tables on a page and moves textblocks into cells of the found tables.
* Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the page rotation.
* 0 -> LowerLeft
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
*
* DirAdj (Text direction adjusted) values can not be used here.
*
* @param cleanRulings The lines used to build the table.
* @param page Page object that contains textblocks and statistics.
*/
public void extractTables(CleanRulings cleanRulings, Page page) {
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
@ -63,7 +88,10 @@ public class TableExtractionService {
for (AbstractTextContainer abstractTextContainer : page.getTextBlocks()) {
TextBlock textBlock = (TextBlock) abstractTextContainer;
for (Cell cell : cells) {
if (cell.intersects(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(), textBlock.getHeight())) {
if (cell.intersects(textBlock.getPdfMinX(),
textBlock.getPdfMinY(),
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
cell.addTextBlock(textBlock);
toBeRemoved.add(textBlock);
break;
@ -94,7 +122,7 @@ public class TableExtractionService {
Iterator<AbstractTextContainer> itty = page.getTextBlocks().iterator();
while (itty.hasNext()) {
AbstractTextContainer textBlock = itty.next();
if (table.contains(textBlock) && position == -1) {
if (textBlock instanceof TextBlock ? table.containsBlock((TextBlock) textBlock) : table.contains(textBlock) && position == -1) {
position = page.getTextBlocks().indexOf(textBlock);
}
}

View File

@ -1,5 +1,15 @@
package com.iqser.red.service.redaction.v1.server.visualization.service;
import java.awt.Color;
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
@ -11,16 +21,6 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.springframework.stereotype.Service;
import java.awt.Color;
import java.io.IOException;
import java.util.List;
@Slf4j
@Service
@RequiredArgsConstructor
@ -79,10 +79,11 @@ public class PdfVisualisationService {
}
contentStream.setStrokingColor(Color.YELLOW);
contentStream.addRect((float) analyzedPage.getBodyTextFrame().getX(),
(float) analyzedPage.getBodyTextFrame().getY(),
(float) analyzedPage.getBodyTextFrame().getWidth(),
(float) analyzedPage.getBodyTextFrame().getHeight());
contentStream.addRect(analyzedPage.getBodyTextFrame().getTopLeft().getX(),
analyzedPage.getBodyTextFrame().getTopLeft().getY(),
analyzedPage.getBodyTextFrame().getWidth(),
analyzedPage.getBodyTextFrame().getHeight());
contentStream.stroke();
contentStream.close();
@ -94,20 +95,39 @@ public class PdfVisualisationService {
contentStream.setStrokingColor(Color.RED);
contentStream.addRect(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(), textBlock.getHeight());
contentStream.addRect(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getPdfMaxX() - textBlock.getPdfMinX(), textBlock.getPdfMaxY() - textBlock.getPdfMinY());
contentStream.stroke();
if (textBlock.getClassification() != null) {
contentStream.beginText();
contentStream.setNonStrokingColor(Color.BLUE);
contentStream.setFont(PDType1Font.TIMES_ROMAN, 12f);
contentStream.setFont(PDType1Font.TIMES_ROMAN, 9f);
contentStream.newLineAtOffset(textBlock.getMinX(), textBlock.getMaxY());
contentStream.showText(textBlock.getClassification() + textBlock.getOrientation());
contentStream.newLineAtOffset(textBlock.getPdfMinX(), textBlock.getPdfMaxY() + 2);
contentStream.showText(textBlock.getClassification() + textBlock.getOrientation() + "-->" + textBlock.getSequences().get(0).getDir());
contentStream.endText();
contentStream.setNonStrokingColor(Color.BLUE);
contentStream.setFont(PDType1Font.TIMES_ROMAN, 2f);
// contentStream.beginText();
// contentStream.newLineAtOffset(textBlock.getPdfMinX(), textBlock.getPdfMinY());
// contentStream.showText("MinX,MinY(" + textBlock.getPdfMinX() + "," + textBlock.getPdfMinY() + ")");
// contentStream.endText();
// contentStream.beginText();
// contentStream.newLineAtOffset(textBlock.getPdfMaxX(), textBlock.getPdfMinY());
// contentStream.showText("MaxX,MinY(" + textBlock.getPdfMaxX() + "," + textBlock.getPdfMinY() + ")");
// contentStream.endText();
// contentStream.beginText();
// contentStream.newLineAtOffset(textBlock.getPdfMinX(), textBlock.getPdfMaxY());
// contentStream.showText("MinX,MaxY(" + textBlock.getPdfMinX() + "," + textBlock.getPdfMaxY() + ")");
// contentStream.endText();
// contentStream.beginText();
// contentStream.newLineAtOffset(textBlock.getPdfMaxX(), textBlock.getPdfMaxY());
// contentStream.showText("MaxX,MaxY(" + textBlock.getPdfMaxX() + "," + textBlock.getPdfMaxY() + ")");
// contentStream.endText();
}
}
@ -124,7 +144,10 @@ public class PdfVisualisationService {
contentStream.setStrokingColor(Color.GREEN);
for (TextBlock textBlock : cell.getTextBlocks()) {
contentStream.addRect(textBlock.getMinX(), textBlock.getMinY(), textBlock.getWidth(), textBlock.getHeight());
contentStream.addRect(textBlock.getPdfMinX(),
textBlock.getPdfMinY(),
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
textBlock.getPdfMaxY() - textBlock.getPdfMinY());
contentStream.stroke();
}
}

View File

@ -155,9 +155,8 @@ public class HeadlinesGoldStandardIntegrationTest {
System.out.println("Precision is: " + precision + " recall is: " + recall);
Assertions.assertThat(precision).isGreaterThanOrEqualTo(0.45f);
Assertions.assertThat(precision).isGreaterThanOrEqualTo(0.44f);
Assertions.assertThat(recall).isGreaterThanOrEqualTo(0.69f);
}

View File

@ -364,7 +364,7 @@ public class RedactionIntegrationTest {
@Test
public void titleExtraction() throws IOException {
AnalyzeRequest request = prepareStorage("files/RSS/32 - Emamectin Benzoate Technical - Acute Oral Toxicity - Mouse.pdf");
AnalyzeRequest request = prepareStorage("files/RSS/06 - Isopyrazam - Acute Oral Toxicity Rat.pdf");
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
AnalyzeResult result = analyzeService.analyze(request);
@ -1098,7 +1098,7 @@ public class RedactionIntegrationTest {
System.out.println("classificationTest");
AnalyzeRequest request = prepareStorage("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
AnalyzeRequest request = prepareStorage("files/new/RotateTestFile.pdf");
RedactionRequest redactionRequest = RedactionRequest.builder()
.dossierId(request.getDossierId())