Compare commits

...

7 Commits
main ... 0.78.5

Author SHA1 Message Date
Dominique Eifländer
e030ec9dd2 Merge branch 'RED-9103' into 'release/0.78.x'
RED-9103: Fixed save of document viewer file

See merge request fforesight/layout-parser!144
2024-05-02 13:03:38 +02:00
Dominique Eifländer
49139ee603 RED-9103: Fixed save of document viewer file 2024-05-02 12:55:14 +02:00
Dominique Eifländer
07da43f2d9 hotfix: revert layoutparsingResponseQueue changes that is not in persistence-service, should be done in migration to 4.0.0 2024-04-24 15:24:19 +02:00
Dominique Eifländer
df0bbc92c7 RED-8932 Fixed not merged headline with identifier 2024-04-24 11:38:26 +02:00
Kilian Schüttler
0497d764ec Merge branch 'hotfix' into 'release/0.78.x'
hotfix: remove DLQ for layoutparsing finished queue

See merge request fforesight/layout-parser!129
2024-04-08 15:39:04 +02:00
Kilian Schuettler
1362e4fbb2 hotfix: remove DLQ for layoutparsing finished queue 2024-04-08 15:31:35 +02:00
Dominique Eifländer
665ad40b0b RED-8627: Fixed scrambled text after sorting 2024-03-19 14:46:04 +01:00
6 changed files with 33 additions and 39 deletions

View File

@ -16,6 +16,8 @@ deploy:
reports:
dotenv: version.env
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
- if: $CI_COMMIT_BRANCH =~ /^release/
- if: $CI_COMMIT_TAG
pmd:
allow_failure: true

View File

@ -196,6 +196,12 @@ public class TextPositionSequence implements CharSequence {
}
public float getTextHeightNoPadding() {
return textPositions.get(0).getHeightDir();
}
@JsonIgnore
@JsonAttribute(ignore = true)
public float getTextHeight() {
@ -234,6 +240,7 @@ public class TextPositionSequence implements CharSequence {
@JsonIgnore
@JsonAttribute(ignore = true)
public String getFontStyle() {
if (textPositions.get(0).getFontName() == null) {
return "standard";
}

View File

@ -61,8 +61,8 @@ public class DocuMineBlockificationService {
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle()
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle()
.contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString());
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();

View File

@ -4,11 +4,9 @@ import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.io.OutputStream;
import java.util.HashSet;
import java.util.Set;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
@ -40,7 +38,6 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class ViewerDocumentService {
private static final String LAYER_NAME = "Layout grid";
private static final int FONT_SIZE = 10;
public static final float LINE_WIDTH = 1f;
@ -54,8 +51,7 @@ public class ViewerDocumentService {
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue);
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, layerVisibilityDefaultValue);
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
@ -68,7 +64,8 @@ public class ViewerDocumentService {
// e.g. not escaped matrix transformations.
escapePreviousContents(pdDocument, pdPage);
VisualizationsOnPage visualizationsOnPage = layoutGrid.getVisualizationsPerPages().get(pageNumber);
VisualizationsOnPage visualizationsOnPage = layoutGrid.getVisualizationsPerPages()
.get(pageNumber);
assert pageNumber == visualizationsOnPage.getPageNumber();
// We need to append to the content stream, otherwise the content could be overlapped by following content.
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
@ -102,11 +99,11 @@ public class ViewerDocumentService {
contentStream.setFont(font, FONT_SIZE);
contentStream.beginText();
Matrix textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
(float) textDeRotationMatrix.getShearX(),
(float) textDeRotationMatrix.getShearY(),
(float) textDeRotationMatrix.getScaleY(),
(float) placedText.lineStart().getX(),
(float) placedText.lineStart().getY());
(float) textDeRotationMatrix.getShearX(),
(float) textDeRotationMatrix.getShearY(),
(float) textDeRotationMatrix.getScaleY(),
(float) placedText.lineStart().getX(),
(float) placedText.lineStart().getY());
textMatrix.translate(-((font.getStringWidth(placedText.text()) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), -FONT_SIZE);
contentStream.setTextMatrix(textMatrix);
contentStream.showText(placedText.text());
@ -115,12 +112,9 @@ public class ViewerDocumentService {
contentStream.restoreGraphicsState();
contentStream.endMarkedContent();
}
dictionariesToUpdate.add(pdPage.getCOSObject());
dictionariesToUpdate.add(pdPage.getResources().getCOSObject());
}
dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject());
// dictionariesToUpdate.add(pdDocument.getDocument().getTrailer());
pdDocument.saveIncremental(outputStream, dictionariesToUpdate);
pdDocument.save(outputStream, CompressParameters.NO_COMPRESSION);
}
@ -145,7 +139,7 @@ public class ViewerDocumentService {
}
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, Set<COSDictionary> dictionariesToUpdate, boolean layerVisibilityDefaultValue) {
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, boolean layerVisibilityDefaultValue) {
PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
PDOptionalContentProperties ocprops = catalog.getOCProperties();
@ -161,7 +155,6 @@ public class ViewerDocumentService {
ocprops.addGroup(layer);
}
ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue);
dictionariesToUpdate.add(catalog.getCOSObject());
return layer;
}

View File

@ -28,15 +28,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPosit
*
* @author Ben Litchfield
*/
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence>
{
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> {
@Override
public int compare(TextPositionSequence pos1, TextPositionSequence pos2)
{
public int compare(TextPositionSequence pos1, TextPositionSequence pos2) {
// only compare text that is in the same direction
int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
if (cmp1 != 0)
{
if (cmp1 != 0) {
return cmp1;
}
@ -48,25 +46,19 @@ public class TextPositionSequenceComparator implements Comparator<TextPositionSe
float pos2YBottom = pos2.getMaxYDirAdj();
// note that the coordinates have been adjusted so 0,0 is in upper left
float pos1YTop = pos1YBottom - pos1.getTextHeight();
float pos2YTop = pos2YBottom - pos2.getTextHeight();
float pos1YTop = pos1YBottom - pos1.getTextHeightNoPadding();
float pos2YTop = pos2YBottom - pos2.getTextHeightNoPadding();
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
// we will do a simple tolerance comparison
if (yDifference < .1 ||
pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom ||
pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)
{
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
return Float.compare(x1, x2);
}
else if (pos1YBottom < pos2YBottom)
{
} else if (pos1YBottom < pos2YBottom) {
return -1;
}
else
{
} else {
return 1;
}
}
}