Compare commits
3 Commits
master
...
proompting
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
32c2087383 | ||
|
|
d0126d36a7 | ||
|
|
b2621fb4b3 |
124
drools-prompt/EntityCreationService_doc
Normal file
124
drools-prompt/EntityCreationService_doc
Normal file
@ -0,0 +1,124 @@
|
||||
/**
|
||||
* Searches the provided SemanticNode for the keyword and creates an Entity for each occurrence.
|
||||
* @param keyword the string to search for
|
||||
* @param type The type of the RedactionEntity to be created
|
||||
* @param entityType The EntityType of the RedactionEntity to be created
|
||||
* @param node The SemanticNode to search in
|
||||
* @return A Stream of RedactionEntities with the keyword as value, the type as type and the provided EntityType
|
||||
*/
|
||||
public Stream<RedactionEntity> byString(String keyword, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Same as byString, but case insensitive.
|
||||
*/
|
||||
public Stream<RedactionEntity> byStringIgnoreCase(String keyword, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Searches the provided SemanticNode with the regexPattern and creates a new RedactionEntity with the provided group for each occurrence.
|
||||
* @param regexPattern The regexPattern
|
||||
* @param type The type of the RedactionEntity to be created
|
||||
* @param entityType The EntityType of the RedactionEntity to be created
|
||||
* @param group the regexPattern group, that should be the entity
|
||||
* @param node The SemanticNode to search in
|
||||
* @return A Stream of RedactionEntities with the text matched by the provided group as value, the type as type and the provided EntityType
|
||||
*/
|
||||
public Stream<RedactionEntity> byRegex(String regexPattern, String type, EntityType entityType, int group, SemanticNode node)
|
||||
/**
|
||||
* Same as byRegex, but case insensitive.
|
||||
*/
|
||||
public Stream<RedactionEntity> byRegexIgnoreCase(String regexPattern, String type, EntityType entityType, int group, SemanticNode node)
|
||||
/**
|
||||
* Same as byRegex, but can handle patterns with linebreaks.
|
||||
*/
|
||||
public Stream<RedactionEntity> byRegexWithLineBreaks(String regexPattern, String type, EntityType entityType, int group, SemanticNode node)
|
||||
/**
|
||||
* Same as byRegexWithLineBreaks, but case insensitive.
|
||||
*/
|
||||
public Stream<RedactionEntity> byRegexWithLineBreaksIgnoreCase(String regexPattern, String type, EntityType entityType, int group, SemanticNode node)
|
||||
/**
|
||||
* Finds the provided string and creates a new RedactionEntity from the text following it, up to the end of the line in which it is found.
|
||||
* @param string The keyword to search for
|
||||
* @param type The type of the RedactionEntity to be created
|
||||
* @param entityType The EntityType of the RedactionEntity to be created
|
||||
* @param node The SemanticNode to search in
|
||||
* @return A Stream of RedactionEntities with the text after the keyword as value, the type as type and the provided EntityType
|
||||
*/
|
||||
public Stream<RedactionEntity> lineAfterString(String string, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Same as lineAfterString, but with multiple keywords
|
||||
*/
|
||||
public Stream<RedactionEntity> lineAfterStrings(List<String> strings, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Finds the provided string in a TableCell, and creates a new RedactionEntity in the same line but adjacent table cells to the right.
|
||||
* @param string The keyword to search for
|
||||
* @param type The type of the RedactionEntity to be created
|
||||
* @param entityType The EntityType of the RedactionEntity to be created
|
||||
* @param table The TableNode to search in
|
||||
* @return A Stream of RedactionEntities created from the table cells to the right of the keyword, with the type as type and the provided EntityType
|
||||
*/
|
||||
public Stream<RedactionEntity> lineAfterStringAcrossColumns(String string, String type, EntityType entityType, TableNode table)
|
||||
/**
|
||||
* Creates a redaction entity based on the given boundary, type, entity type, and semantic node.
|
||||
*
|
||||
* @param boundary The boundary of the redaction entity.
|
||||
* @param type The type of the redaction entity.
|
||||
* @param entityType The entity type of the redaction entity.
|
||||
* @param node The semantic node where the boundary is.
|
||||
* @return An Optional containing the new redaction entity.
|
||||
*/
|
||||
public Optional<RedactionEntity> byBoundary(Boundary boundary, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Creates new RedactionEntities between the provided start and stop boundaries. The start and stop boundaries are excluded.
|
||||
* If any boundaries of the new RedactionEntities overlap, only the shortest boundary will be used.
|
||||
* @param startBoundaries List of start boundaries
|
||||
* @param stopBoundaries List of stop boundaries
|
||||
* @param type The type of the redaction entity.
|
||||
* @param entityType The entity type of the redaction entity.
|
||||
* @param node The semantic node where the boundaries are.
|
||||
* @return A Stream of new RedactionEntities between the start and stop boundaries
|
||||
*/
|
||||
public Stream<RedactionEntity> betweenBoundaries(List<Boundary> startBoundaries, List<Boundary> stopBoundaries, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Same as betweenBoundaries, but it creates the start and stop boundaries by performing a text search on the provided SemanticNode.
|
||||
*/
|
||||
public Stream<RedactionEntity> betweenStrings(String start, String stop, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Same as betweenStrings, but case insensitive.
|
||||
*/
|
||||
public Stream<RedactionEntity> betweenStringsIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* These 6 functions work the same as betweenStrings, but they also include the start and/or stop strings or are case insensitive, depending on their name.
|
||||
*/
|
||||
public Stream<RedactionEntity> betweenStringsIncludeStart(String start, String stop, String type, EntityType entityType, SemanticNode node)
|
||||
public Stream<RedactionEntity> betweenStringsIncludeStartIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node)
|
||||
public Stream<RedactionEntity> betweenStringsIncludeEnd(String start, String stop, String type, EntityType entityType, SemanticNode node)
|
||||
public Stream<RedactionEntity> betweenStringsIncludeEndIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node)
|
||||
public Stream<RedactionEntity> betweenStringsIncludeStartAndEnd(String start, String stop, String type, EntityType entityType, SemanticNode node)
|
||||
public Stream<RedactionEntity> betweenStringsIncludeStartAndEndIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Same as betweenBoundaries, but it creates the start and stop boundaries by performing a regex search on the provided SemanticNode.
|
||||
*/
|
||||
public Stream<RedactionEntity> betweenRegexes(String regexStart, String regexStop, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Same as betweenRegexes, but case insensitive.
|
||||
*/
|
||||
public Stream<RedactionEntity> betweenRegexesIgnoreCase(String regexStart, String regexStop, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Creates a new RedactionEntity which has the same boundary as the provided SemanticNode.
|
||||
* @param node The SemanticNode to create a new RedactionEntity from.
|
||||
* @param type The type of the redaction entity.
|
||||
* @param entityType The entity type of the redaction entity.
|
||||
* @return An optional RedactionEntity. Is empty, if the provided SemanticNode is empty.
|
||||
*/
|
||||
public Optional<RedactionEntity> bySemanticNode(SemanticNode node, String type, EntityType entityType)
|
||||
/**
|
||||
* Same as bySemanticNode, but ignores the SemanticNode if it is not a Paragraph, and likewise ignores all of its child SemanticNodes that are not Paragraphs.
|
||||
*/
|
||||
public Stream<RedactionEntity> bySemanticNodeParagraphsOnly(SemanticNode node, String type, EntityType entityType)
|
||||
/**
|
||||
* Searches the provided SemanticNode for the provided string, and creates a new RedactionEntity, from the end of the first occurrence of the string until the end of the SemanticNode.
|
||||
* @param string The string to search for
|
||||
* @param type The type of the redaction entity.
|
||||
* @param entityType The entity type of the redaction entity.
|
||||
* @param node The SemanticNode to use and search in
|
||||
* @return An optional RedactionEntity, is empty, if the SemanticNode is empty, or the string isn't found in the SemanticNode.
|
||||
*/
|
||||
public Optional<RedactionEntity> semanticNodeAfterString(String string, String type, EntityType entityType, SemanticNode node)
|
||||
18
drools-prompt/Page_doc
Normal file
18
drools-prompt/Page_doc
Normal file
@ -0,0 +1,18 @@
|
||||
/**
|
||||
* Retrieves the main body text block.
|
||||
*
|
||||
* @return The text block representing the main body of the document.
|
||||
*/
|
||||
public TextBlock getMainBodyTextBlock()
|
||||
/**
|
||||
* Gets all Entities located on the page
|
||||
*
|
||||
* @return Set of all Entities associated with this Page
|
||||
*/
|
||||
Set<RedactionEntity> getEntities();
|
||||
/**
|
||||
* Returns the Page Number
|
||||
*
|
||||
* @return The number of this page
|
||||
*/
|
||||
Integer getPageNumber();
|
||||
33
drools-prompt/RedactionEntity_doc
Normal file
33
drools-prompt/RedactionEntity_doc
Normal file
@ -0,0 +1,33 @@
|
||||
/**
|
||||
* Sets the Entity to applied, this is the default.
|
||||
* @param ruleIdentifier Should always be equal to the ruleIdentifier in the Rule name
|
||||
* @param reason Should describe the intention of the rule in a few words
|
||||
* @param legalBasis Is dependent on the rule, if none is known the default is "n-a", can't be null
|
||||
*/
|
||||
void apply(String ruleIdentifier, String reason, String legalBasis)
|
||||
/**
|
||||
* Same as apply, but legalBasis can be null.
|
||||
*/
|
||||
void force(String ruleIdentifier, String reason, String legalBasis)
|
||||
/**
|
||||
* Sets the Entity to not applied.
|
||||
* @param ruleIdentifier Should always be equal to the ruleIdentifier in the Rule name
|
||||
* @param reason Should describe the intention of the rule in a few words
|
||||
* @param legalBasis Is dependent on the rule, if none is known the default is "n-a", can't be null
|
||||
*/
|
||||
void skip(String ruleIdentifier, String reason, String legalBasis)
|
||||
/**
|
||||
* Sets the Entity to ignored. In most cases this is preferred over remove.
|
||||
* @param ruleIdentifier Should always be equal to the ruleIdentifier in the Rule name
|
||||
* @param reason Should describe the intention of the rule in a few words
|
||||
* @param legalBasis Is dependent on the rule, if none is known the default is "n-a", can't be null
|
||||
*/
|
||||
void ignore(String ruleIdentifier, String reason, String legalBasis)
|
||||
/**
|
||||
* Removes the entity entirely, also removes it from all EntitySets and sets intersectingNodes and deepestFullyContainingNode to null.
|
||||
* Should only be used in a few cases!
|
||||
* @param ruleIdentifier Should always be equal to the ruleIdentifier in the Rule name
|
||||
* @param reason Should describe the intention of the rule in a few words
|
||||
* @param legalBasis Is dependent on the rule, if none is known the default is "n-a", can't be null
|
||||
*/
|
||||
void remove(String ruleIdentifier, String reason, String legalBasis)
|
||||
39
drools-prompt/RedactionEntity_properties_doc
Normal file
39
drools-prompt/RedactionEntity_properties_doc
Normal file
@ -0,0 +1,39 @@
|
||||
/**
|
||||
* A boundary has a start and end property accessible by start() and end() respectively. They mark the start and end positions as String offsets.
|
||||
*/
|
||||
Boundary boundary
|
||||
/**
|
||||
* The type of an Entity identifies groups of Entities. Examples for Entities we want to find are "CBI_author" for authors, "CBI_address" for addresses or "PII" for personally identifiable information.
|
||||
* Other types include helper types, which interact with the main Entities, for example "published_information" or "vertebrate". These Entities are used to modify the main Entities, if they occur in the same Section.
|
||||
* A typical example would be to ignore Entities of type "CBI_author", if they occur in the same Section as "published_information" entities.
|
||||
*/
|
||||
String type
|
||||
/**
|
||||
* The EntityType can be one of four different values: ENTITY, RECOMMENDATION, FALSE_POSITIVE, FALSE_RECOMMENDATION.
|
||||
* If an ENTITY is overlapped by a FALSE_POSITIVE, the ENTITY is removed. If a RECOMMENDATION is overlapped by either an ENTITY or FALSE_RECOMMENDATION, it is removed.
|
||||
*/
|
||||
EntityType entityType
|
||||
/**
|
||||
* The text the Entity represents.
|
||||
*/
|
||||
String value
|
||||
/**
|
||||
* Up to three words after the Entity in the text.
|
||||
*/
|
||||
String textAfter
|
||||
/**
|
||||
* Up to three words before the Entity in the text.
|
||||
*/
|
||||
String textBefore
|
||||
/**
|
||||
* All pages whose TextBlock intersects the boundary of this entity. Is always equal to the Pages which have this RedactionEntity in their EntitySet.
|
||||
*/
|
||||
Set<Page> pages
|
||||
/**
|
||||
* All SemanticNodes whose TextBlock intersects the boundary of this entity. Is always equal to the SemanticNodes which have this RedactionEntity in their EntitySet.
|
||||
*/
|
||||
List<SemanticNode> intersectingNodes
|
||||
/**
|
||||
* The SemanticNode which is the deepest in the Tree structure and whose TextBlock fully contains the boundary of this entity.
|
||||
*/
|
||||
SemanticNode deepestFullyContainingNode
|
||||
6
drools-prompt/Section_doc
Normal file
6
drools-prompt/Section_doc
Normal file
@ -0,0 +1,6 @@
|
||||
/**
|
||||
* Determines whether this Section has any tables.
|
||||
*
|
||||
* @return {@code true} if there are tables, {@code false} otherwise
|
||||
*/
|
||||
public boolean hasTables()
|
||||
181
drools-prompt/SemanticNode_doc
Normal file
181
drools-prompt/SemanticNode_doc
Normal file
@ -0,0 +1,181 @@
|
||||
/**
|
||||
* Returns the type of this node, such as NodeType.SECTION, NodeType.PARAGRAPH, etc.
|
||||
*
|
||||
* @return NodeType of this node
|
||||
*/
|
||||
NodeType getType();
|
||||
/**
|
||||
* Any Node maintains its own Set of Entities.
|
||||
* This Set contains all Entities whose boundary intersects the boundary of this node.
|
||||
* The Entities might overlap with the Entities in other Sets
|
||||
*
|
||||
* @return Set of all Entities associated with this Node
|
||||
*/
|
||||
Set<RedactionEntity> getEntities();
|
||||
/**
|
||||
* Returns all Pages this SemanticNode is associated with.
|
||||
*
|
||||
* @return Set of Pages this node appears on.
|
||||
*/
|
||||
Set<Page> getPages()
|
||||
/**
|
||||
* Checks if this node appears on the specified page number.
|
||||
*
|
||||
* @param pageNumber The page number to check.
|
||||
* @return True if this node is found on the specified page number, false otherwise.
|
||||
*/
|
||||
boolean isOnPage(int pageNumber)
|
||||
/**
|
||||
* Returns the closest Headline associated with this SemanticNode
|
||||
*
|
||||
* @return First Headline found.
|
||||
*/
|
||||
Headline getHeadline()
|
||||
/**
|
||||
* @return The SemanticNode representing the Parent in the DocumentTree
|
||||
* @throws NotFoundException when no parent is present
|
||||
*/
|
||||
SemanticNode getParent()
|
||||
/**
|
||||
* Checks whether this SemanticNode has any Entity of the provided type.
|
||||
* Ignores Entity with ignored == true or removed == true.
|
||||
*
|
||||
* @param type string representing the type of entity to check for
|
||||
* @return true, if this SemanticNode has at least one Entity of the provided type
|
||||
*/
|
||||
boolean hasEntitiesOfType(String type)
|
||||
/**
|
||||
* Checks whether this SemanticNode has any Entity of the provided types.
|
||||
* Ignores Entity with ignored == true or removed == true.
|
||||
*
|
||||
* @param types an array of strings representing the types of entities to check for
|
||||
* @return true, if this SemanticNode has at least one Entity of any of the provided types
|
||||
*/
|
||||
boolean hasEntitiesOfAnyType(String... types)
|
||||
/**
|
||||
* Checks whether this SemanticNode has at least one Entity of each of the provided types.
|
||||
* Ignores Entity with ignored == true or removed == true.
|
||||
*
|
||||
* @param types an array of strings representing the types of entities to check for
|
||||
* @return true, if this SemanticNode has at least one Entity of each of the provided types
|
||||
*/
|
||||
boolean hasEntitiesOfAllTypes(String... types)
|
||||
/**
|
||||
* Returns a List of Entities in this SemanticNode which are of the provided type such as "CBI_author".
|
||||
* Ignores Entity with ignored == true or removed == true.
|
||||
*
|
||||
* @param type string representing the type of entities to return
|
||||
* @return List of RedactionEntities of the provided type
|
||||
*/
|
||||
List<RedactionEntity> getEntitiesOfType(String type)
|
||||
/**
|
||||
* Returns a List of Entities in this SemanticNode which have any of the provided types such as "CBI_author".
|
||||
* Ignores Entity with ignored == true or removed == true.
|
||||
*
|
||||
* @param types A list of strings representing the types of entities to return
|
||||
* @return List of RedactionEntities of any provided type
|
||||
*/
|
||||
List<RedactionEntity> getEntitiesOfType(List<String> types)
|
||||
/**
|
||||
* Returns a List of Entities in this SemanticNode which have any of the provided types.
|
||||
* Ignores Entity with the ignored flag set to true or the removed flag set to true.
|
||||
*
|
||||
* @param types A list of strings representing the types of entities to return
|
||||
* @return List of RedactionEntities that match any of the provided types
|
||||
*/
|
||||
List<RedactionEntity> getEntitiesOfType(String... types)
|
||||
/**
|
||||
* Checks whether this SemanticNode contains the provided String.
|
||||
*
|
||||
* @param string A String which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains the string
|
||||
*/
|
||||
boolean containsString(String string)
|
||||
/**
|
||||
* Checks whether this SemanticNode contains all the provided Strings.
|
||||
*
|
||||
* @param strings A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains all strings
|
||||
*/
|
||||
boolean containsAllStrings(String... strings)
|
||||
/**
|
||||
* Checks whether this SemanticNode contains any of the provided Strings.
|
||||
*
|
||||
* @param strings A List of Strings to check if they are contained in the TextBlock
|
||||
* @return true, if this node's TextBlock contains any of the provided strings
|
||||
*/
|
||||
boolean containsAnyString(String... strings)
|
||||
/**
|
||||
* Checks whether this SemanticNode contains the provided String ignoring case.
|
||||
*
|
||||
* @param string A String which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains the string ignoring case
|
||||
*/
|
||||
boolean containsStringIgnoreCase(String string)
|
||||
/**
|
||||
* Checks whether this SemanticNode contains any of the provided Strings ignoring case.
|
||||
*
|
||||
* @param strings A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains any of the strings
|
||||
*/
|
||||
boolean containsAnyStringIgnoreCase(String... strings)
|
||||
/**
|
||||
* Checks whether this SemanticNode contains all of the provided Strings ignoring case.
|
||||
*
|
||||
* @param strings A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains all of the strings ignoring case
|
||||
*/
|
||||
boolean containsAllStringsIgnoreCase(String... strings)
|
||||
/**
|
||||
* Checks whether this SemanticNode matches the provided regex pattern.
|
||||
*
|
||||
* @param regexPattern A String representing a regex pattern, which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains the regex pattern
|
||||
*/
|
||||
boolean matchesRegex(String regexPattern)
|
||||
/**
|
||||
* Checks whether this SemanticNode matches the provided regex pattern ignoring case.
|
||||
*
|
||||
* @param regexPattern A String representing a regex pattern, which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains the regex pattern ignoring case
|
||||
*/
|
||||
boolean matchesRegexIgnoreCase(String regexPattern)
|
||||
/**
|
||||
* Streams all children located directly underneath this node in the DocumentTree.
|
||||
*
|
||||
* @return Stream of all children
|
||||
*/
|
||||
Stream<SemanticNode> streamChildren()
|
||||
/**
|
||||
* Streams all children located directly underneath this node in the DocumentTree of the provided type.
|
||||
*
|
||||
* @param nodeType the type of nodes to stream
|
||||
* @return Stream of all children of the provided type
|
||||
*/
|
||||
Stream<SemanticNode> streamChildrenOfType(NodeType nodeType)
|
||||
/**
|
||||
* Recursively streams all SemanticNodes located underneath this node in the DocumentTree in order.
|
||||
*
|
||||
* @return Stream of all SubNodes
|
||||
*/
|
||||
Stream<SemanticNode> streamAllSubNodes()
|
||||
/**
|
||||
* Recursively streams all SemanticNodes of a specified type located underneath this node in the DocumentTree in order.
|
||||
*
|
||||
* @param nodeType the type of nodes to be streamed
|
||||
* @return a Stream of all SubNodes of the specified type
|
||||
*/
|
||||
Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType)
|
||||
/**
|
||||
* The Boundary is the start and end string offsets in the reading order of the document.
|
||||
*
|
||||
* @return Boundary of this Node's TextBlock
|
||||
*/
|
||||
Boundary getBoundary()
|
||||
/**
|
||||
* The SectionIdentifier uses the numeric identifiers of Headlines to infer a tree structure.
|
||||
* It implements functions such as sectionIdentifier.isChildOf(otherSectionIdentifier) and sectionIdentifier.isParentOf(otherSectionIdentifier)
|
||||
*
|
||||
* @return The SectionIdentifier from the first Headline.
|
||||
*/
|
||||
SectionIdentifier getSectionIdentifier()
|
||||
77
drools-prompt/Table_doc
Normal file
77
drools-prompt/Table_doc
Normal file
@ -0,0 +1,77 @@
|
||||
/**
|
||||
* Streams all entities in this table, that appear in a row, which contains any of the provided strings.
|
||||
*
|
||||
* @param strings Strings to check whether a row contains them
|
||||
* @return Stream of all entities in this table, that appear in a row, which contains any of the provided strings
|
||||
*/
|
||||
Stream<RedactionEntity> streamEntitiesWhereRowContainsStringsIgnoreCase(List<String> strings)
|
||||
/**
|
||||
* Checks whether the specified row contains all the provided strings, ignoring case.
|
||||
*
|
||||
* @param row the row to check as an Integer, must be smaller than numberOfRows
|
||||
* @param strings a list of strings to check for
|
||||
* @return true, if all strings appear in the provided row
|
||||
*/
|
||||
boolean rowContainsStringsIgnoreCase(Integer row, List<String> strings)
|
||||
/**
|
||||
* Streams all entities which appear in a row where at least one cell has the provided header and the provided value.
|
||||
*
|
||||
* @param header the header value to search for
|
||||
* @param value the string which the table cell should contain
|
||||
* @return a stream of all entities, which appear in a row where at least one cell has the provided header and the provided value.
|
||||
*/
|
||||
Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndValue(String header, String value)
|
||||
/**
|
||||
* Streams all entities which appear in a row where at least one cell has the provided header and any provided value.
|
||||
*
|
||||
* @param header the header value to search for
|
||||
* @param values the strings which the table cell should contain
|
||||
* @return a stream of all entities, which appear in a row where at least one cell has the provided header and any provided value.
|
||||
*/
|
||||
Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List<String> values)
|
||||
/**
|
||||
* Streams all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
|
||||
* Ignores Entity with ignored == true or removed == true.
|
||||
*
|
||||
* @param types type strings to check whether a row contains an entity like them
|
||||
* @return Stream of all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
|
||||
*/
|
||||
Stream<RedactionEntity> streamEntitiesWhereRowContainsEntitiesOfType(List<String> types)
|
||||
/**
|
||||
* Streams all entities in this table, that appear in a row, which contains no entity of any of the provided types.
|
||||
* Ignores Entity with ignored == true or removed == true.
|
||||
*
|
||||
* @param types type strings to check whether a row contains an entity like them
|
||||
* @return Stream of all entities in this table, that appear in a row, which contains no entity of any of the provided types.
|
||||
*/
|
||||
Stream<RedactionEntity> streamEntitiesWhereRowContainsNoEntitiesOfType(List<String> types)
|
||||
/**
|
||||
* Streams all TableCells in this Table which have the provided header row-wise.
|
||||
*
|
||||
* @return Stream of all TableCells which have the provided header
|
||||
*/
|
||||
Stream<TableCell> streamTableCellsWithHeader(String header)
|
||||
/**
|
||||
* Streams all Headers and checks if any equal the provided string.
|
||||
*
|
||||
* @param header string to check the headers for
|
||||
* @return true, if at least one header equals the provided string
|
||||
*/
|
||||
boolean hasHeader(String header)
|
||||
/**
|
||||
* Checks if this table has a column with the provided header and any of the table cells in that column contain the provided value.
|
||||
*
|
||||
* @param header string to find header cells
|
||||
* @param value string to check cells with provided header
|
||||
* @return true, if this table has a column with the provided header and any of the table cells in that column contain the provided value
|
||||
*/
|
||||
boolean hasRowWithHeaderAndValue(String header, String value)
|
||||
/**
|
||||
* Finds all entities of the provided type, which appear in the same row that the provided entity appears in.
|
||||
* Ignores Entity with ignored == true or removed == true.
|
||||
*
|
||||
* @param type the type of entities to search for
|
||||
* @param redactionEntity the entity, which appears in the row to search
|
||||
* @return List of all entities of the provided type, which appear in the same row that the provided entity appears in.
|
||||
*/
|
||||
List<RedactionEntity> getEntitiesOfTypeInSameRow(String type, RedactionEntity redactionEntity)
|
||||
591
drools-prompt/drools-prompt
Normal file
591
drools-prompt/drools-prompt
Normal file
@ -0,0 +1,591 @@
|
||||
From now on, you are a Drools rule generator.
|
||||
|
||||
You have a Document data structure written in Java with the following objects:
|
||||
|
||||
- Section
|
||||
- Table
|
||||
- TableCell
|
||||
- Paragraph
|
||||
- Headline
|
||||
- Page
|
||||
- RedactionEntity
|
||||
- EntityCreationService
|
||||
|
||||
The Section, Table, TableCell, Paragraph, and Headline implement a common interface called SemanticNode. SemanticNodes are arranged in a tree-like fashion, where any SemanticNode can have multiple SemanticNodes as children. The arrangement is as follows:
|
||||
- Tables only have TableCells as children.
|
||||
- TableCells may have any child, except TableCells.
|
||||
- Paragraphs and Headlines have no children.
|
||||
- Sections may have any child except TableCells, but if it contains Paragraphs as well as Tables, it is split into a Section with multiple Sections as children, where any child Section only contains either Tables or Paragraphs.
|
||||
Further, if the first SemanticNode is a Headline it remains the first child in the Parent Section, before any subsections.
|
||||
|
||||
The goal of the Software is to find pieces of Text that are relevant. Each piece of text is represented by a RedactionEntity.
|
||||
The main pieces of relevant text are Text we want to redact. For example, we want to redact all Authors of a dossier. Or all personally identifiable information, such as E-Mails and Telephone Numbers.
|
||||
RedactionEntities may also represent other pieces of text, such as published information, or certain species of vertebrates.
|
||||
The RedactionEntities are part of the document structure: each RedactionEntity is referenced by every SemanticNode and Page that contains it, and in turn references every Page and SemanticNode it occurs in.
|
||||
So the same RedactionEntity occurs in the paragraph it is located in, as well as in all of its parent Sections.
|
||||
|
||||
Previous to the execution of rules, the Document structure is assembled and a text search is performed to create initial Entities of different types. They are then inserted into the document structure.
|
||||
Then the KieSession is created and each SemanticNode and Entity is inserted into its working memory.
|
||||
----------------------------------------------------------------
|
||||
The relevant functions for SemanticNode:
|
||||
/**
|
||||
* Returns the type of this node, such as NodeType.SECTION, NodeType.PARAGRAPH, etc.
|
||||
*
|
||||
* @return NodeType of this node
|
||||
*/
|
||||
NodeType getType();
|
||||
|
||||
/**
|
||||
* Any Node maintains its own Set of Entities.
|
||||
* This Set contains all Entities whose boundary intersects the boundary of this node.
|
||||
* The Entities might overlap with the Entities in other Sets
|
||||
*
|
||||
* @return Set of all Entities associated with this Node
|
||||
*/
|
||||
Set<RedactionEntity> getEntities();
|
||||
|
||||
/**
|
||||
* Returns all Pages this SemanticNode is associated with.
|
||||
*
|
||||
* @return Set of Pages this node appears on.
|
||||
*/
|
||||
Set<Page> getPages()
|
||||
|
||||
/**
|
||||
* Checks if this node appears on the specified page number.
|
||||
*
|
||||
* @param pageNumber The page number to check.
|
||||
* @return True if this node is found on the specified page number, false otherwise.
|
||||
*/
|
||||
boolean onPage(int pageNumber)
|
||||
|
||||
/**
|
||||
* For Sections it searches its children and returns the first Headline.
|
||||
* For Paragraphs, Tables, and TableCells it returns getHeadline() of getParent()
|
||||
* For Headline it returns itself and for Headers or Footers it returns an empty dummy Headline.
|
||||
*
|
||||
* @return First Headline found.
|
||||
*/
|
||||
Headline getHeadline()
|
||||
|
||||
/**
|
||||
* @return The SemanticNode representing the Parent in the DocumentTree
|
||||
* When no parent is present, the Document is returned. And for the Document itself it throws an UnsupportedOperationException.
|
||||
*/
|
||||
SemanticNode getParent()
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode has any Entity of the provided type.
|
||||
* Ignores Entity with ignored == true or removed == true.
|
||||
*
|
||||
* @param type string representing the type of entity to check for
|
||||
* @return true, if this SemanticNode has at least one Entity of the provided type
|
||||
*/
|
||||
boolean hasEntitiesOfType(String type)
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode has any Entity of the provided types.
|
||||
* Ignores Entity with ignored == true or removed == true.
|
||||
*
|
||||
* @param types an array of strings representing the types of entities to check for
|
||||
* @return true, if this SemanticNode has at least one Entity of any of the provided types
|
||||
*/
|
||||
boolean hasEntitiesOfAnyType(String... types)
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode has at least one Entity of each of the provided types.
|
||||
* Ignores Entity with ignored == true or removed == true.
|
||||
*
|
||||
* @param types an array of strings representing the types of entities to check for
|
||||
* @return true, if this SemanticNode has at least one Entity of each of the provided types
|
||||
*/
|
||||
boolean hasEntitiesOfAllTypes(String... types)
|
||||
|
||||
/**
|
||||
* Returns a List of Entities in this SemanticNode which are of the provided type such as "CBI_author".
|
||||
* Ignores Entity with ignored == true or removed == true.
|
||||
*
|
||||
* @param type string representing the type of entities to return
|
||||
* @return List of RedactionEntities of the provided type
|
||||
*/
|
||||
List<RedactionEntity> getEntitiesOfType(String type)
|
||||
|
||||
/**
|
||||
* Returns a List of Entities in this SemanticNode which have any of the provided types such as "CBI_author".
|
||||
* Ignores Entity with ignored == true or removed == true.
|
||||
*
|
||||
* @param types A list of strings representing the types of entities to return
|
||||
* @return List of RedactionEntities of any provided type
|
||||
*/
|
||||
List<RedactionEntity> getEntitiesOfType(List<String> types)
|
||||
|
||||
/**
|
||||
* Returns a List of Entities in this SemanticNode which have any of the provided types.
|
||||
* Ignores Entity with the ignored flag set to true or the removed flag set to true.
|
||||
*
|
||||
* @param types A list of strings representing the types of entities to return
|
||||
* @return List of RedactionEntities that match any of the provided types
|
||||
*/
|
||||
List<RedactionEntity> getEntitiesOfType(String... types)
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains the provided String.
|
||||
*
|
||||
* @param string A String which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains the string
|
||||
*/
|
||||
boolean containsString(String string)
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains all the provided Strings.
|
||||
*
|
||||
* @param strings A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains all strings
|
||||
*/
|
||||
boolean containsAllStrings(String... strings)
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains any of the provided Strings.
|
||||
*
|
||||
* @param strings A List of Strings to check if they are contained in the TextBlock
|
||||
* @return true, if this node's TextBlock contains any of the provided strings
|
||||
*/
|
||||
boolean containsAnyString(String... strings)
|
||||
/**
|
||||
* Checks whether this SemanticNode contains the provided String ignoring case.
|
||||
*
|
||||
* @param string A String which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains the string ignoring case
|
||||
*/
|
||||
boolean containsStringIgnoreCase(String string)
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains any of the provided Strings ignoring case.
|
||||
*
|
||||
* @param strings A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains any of the strings
|
||||
*/
|
||||
boolean containsAnyStringIgnoreCase(String... strings)
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains all the provided Strings ignoring case.
|
||||
*
|
||||
* @param strings A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains all of the strings ignoring case
|
||||
*/
|
||||
boolean containsAllStringsIgnoreCase(String... strings)
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode matches the provided regex pattern.
|
||||
*
|
||||
* @param regexPattern A String representing a regex pattern, which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains the regex pattern
|
||||
*/
|
||||
boolean matchesRegex(String regexPattern)
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode matches the provided regex pattern ignoring case.
|
||||
*
|
||||
* @param regexPattern A String representing a regex pattern, which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains the regex pattern ignoring case
|
||||
*/
|
||||
boolean matchesRegexIgnoreCase(String regexPattern)
|
||||
|
||||
/**
|
||||
* Streams all children located directly underneath this node in the DocumentTree.
|
||||
*
|
||||
* @return Stream of all children
|
||||
*/
|
||||
Stream<SemanticNode> streamChildren()
|
||||
|
||||
/**
|
||||
* Streams all children located directly underneath this node in the DocumentTree of the provided type.
|
||||
*
|
||||
* @param nodeType the type of nodes to stream
|
||||
* @return Stream of all children of the provided type
|
||||
*/
|
||||
Stream<SemanticNode> streamChildrenOfType(NodeType nodeType)
|
||||
|
||||
/**
|
||||
* Recursively streams all SemanticNodes located underneath this node in the DocumentTree in order.
|
||||
*
|
||||
* @return Stream of all SubNodes
|
||||
*/
|
||||
Stream<SemanticNode> streamAllSubNodes()
|
||||
|
||||
/**
|
||||
* Recursively streams all SemanticNodes of a specified type located underneath this node in the DocumentTree in order.
|
||||
*
|
||||
* @param nodeType the type of nodes to be streamed
|
||||
* @return a Stream of all SubNodes of the specified type
|
||||
*/
|
||||
Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType)
|
||||
|
||||
/**
|
||||
* The Boundary is the start and end string offsets in the reading order of the document.
|
||||
*
|
||||
* @return Boundary of this Node's TextBlock
|
||||
*/
|
||||
Boundary getBoundary()
|
||||
|
||||
/**
|
||||
* The SectionIdentifier uses the numeric identifiers of Headlines to infer a tree structure.
|
||||
* It implements functions such as sectionIdentifier.isChildOf(otherSectionIdentifier) and sectionIdentifier.isParentOf(otherSectionIdentifier)
|
||||
*
|
||||
* @return The SectionIdentifier from the first Headline.
|
||||
*/
|
||||
SectionIdentifier getSectionIdentifier()
|
||||
|
||||
----------------------------------------------------------------
|
||||
The Table has the additional functions:
|
||||
/**
|
||||
* Streams all entities in this table, that appear in a row, which contains any of the provided strings.
|
||||
*
|
||||
* @param strings Strings to check whether a row contains them
|
||||
* @return Stream of all entities in this table, that appear in a row, which contains any of the provided strings
|
||||
*/
|
||||
Stream<RedactionEntity> streamEntitiesWhereRowContainsStringsIgnoreCase(List<String> strings)
|
||||
|
||||
/**
|
||||
* Checks whether the specified row contains all the provided strings.
|
||||
*
|
||||
* @param row the row to check as an Integer, must be smaller than numberOfRows
|
||||
* @param strings a list of strings to check for
|
||||
* @return true, if all strings appear in the provided row
|
||||
*/
|
||||
boolean rowContainsStringsIgnoreCase(Integer row, List<String> strings)
|
||||
|
||||
/**
|
||||
* Streams all entities which appear in a row where at least one cell has the provided header and the provided value.
|
||||
*
|
||||
* @param header the header value to search for
|
||||
* @param value the string which the table cell should contain
|
||||
* @return a stream of all entities, which appear in a row where at least one cell has the provided header and the provided value.
|
||||
*/
|
||||
Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndValue(String header, String value)
|
||||
|
||||
/**
|
||||
* Streams all entities which appear in a row where at least one cell has the provided header and any provided value.
|
||||
*
|
||||
* @param header the header value to search for
|
||||
* @param values the strings which the table cell should contain
|
||||
* @return a stream of all entities, which appear in a row where at least one cell has the provided header and any provided value.
|
||||
*/
|
||||
Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List<String> values)
|
||||
|
||||
/**
|
||||
* Streams all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
|
||||
* Ignores Entity with ignored == true or removed == true.
|
||||
*
|
||||
* @param types type strings to check whether a row contains an entity like them
|
||||
* @return Stream of all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
|
||||
*/
|
||||
Stream<RedactionEntity> streamEntitiesWhereRowContainsEntitiesOfType(List<String> types)
|
||||
|
||||
/**
|
||||
* Streams all entities in this table, that appear in a row, which contains no entity of any of the provided types.
|
||||
* Ignores Entity with ignored == true or removed == true.
|
||||
*
|
||||
* @param types type strings to check whether a row contains an entity like them
|
||||
* @return Stream of all entities in this table, that appear in a row, which contains no entity of any of the provided types.
|
||||
*/
|
||||
Stream<RedactionEntity> streamEntitiesWhereRowContainsNoEntitiesOfType(List<String> types)
|
||||
|
||||
/**
|
||||
* Streams all TableCells in this Table which have the provided header row-wise.
|
||||
*
|
||||
* @return Stream of all TableCells which have the provided header
|
||||
*/
|
||||
Stream<TableCell> streamTableCellsWithHeader(String header)
|
||||
|
||||
/**
|
||||
* Streams all Headers and checks if any equal the provided string.
|
||||
*
|
||||
* @param header string to check the headers for
|
||||
* @return true, if at least one header equals the provided string
|
||||
*/
|
||||
boolean hasHeader(String header)
|
||||
|
||||
/**
|
||||
* Checks if this table has a column with the provided header and any of the table cells in that column contain the provided value.
|
||||
*
|
||||
* @param header string to find header cells
|
||||
* @param value string to check cells with provided header
|
||||
* @return true, if this table has a column with the provided header and any of the table cells in that column contain the provided value
|
||||
*/
|
||||
boolean hasRowWithHeaderAndValue(String header, String value)
|
||||
|
||||
/**
|
||||
* Finds all entities of the provided type, which appear in the same row that the provided entity appears in.
|
||||
* Ignores Entity with ignored == true or removed == true.
|
||||
*
|
||||
* @param type the type of entities to search for
|
||||
* @param redactionEntity the entity, which appears in the row to search
|
||||
* @return List of all entities of the provided type, which appear in the same row that the provided entity appears in.
|
||||
*/
|
||||
List<RedactionEntity> getEntitiesOfTypeInSameRow(String type, RedactionEntity redactionEntity)
|
||||
----------------------------------------------------------------
|
||||
The Section has these additional functions:
|
||||
/**
|
||||
* Determines whether this Section has any tables.
|
||||
*
|
||||
* @return {@code true} if there are tables, {@code false} otherwise
|
||||
*/
|
||||
boolean hasTables()
|
||||
----------------------------------------------------------------
|
||||
The Page Object has the following functions:
|
||||
/**
|
||||
* Retrieves the main body text block.
|
||||
* @return The text block representing the main body of the document.
|
||||
*/
|
||||
public TextBlock getMainBodyTextBlock()
|
||||
/**
|
||||
* @return All SemanticNodes that occur on the page, except Header and Footer
|
||||
*/
|
||||
public List<SemanticNode> getMainBody()
|
||||
/**
|
||||
* Gets all Entities located on the page
|
||||
* @return Set of all Entities associated with this Page
|
||||
*/
|
||||
Set<RedactionEntity> getEntities();
|
||||
/**
|
||||
* Returns the Page Number
|
||||
*
|
||||
* @return The number of this page
|
||||
*/
|
||||
Integer getPageNumber();
|
||||
----------------------------------------------------------------
|
||||
|
||||
----------------------------------------------------------------
|
||||
The RedactionEntity has the following properties:
|
||||
/**
|
||||
* A boundary has a start and end property accessible by start() and end() respectively. It marks the start and end offset in String offsets.
|
||||
*/
|
||||
Boundary boundary
|
||||
/**
|
||||
* The type of an Entity identifies groups of Entities. Examples of Entities we want to find are "CBI_author" for authors, "CBI_address" for addresses or "PII" for personally identifiable information.
|
||||
* Other types include helper types, which interact with the main Entities, for example "published_information" or "vertebrate". These Entities are used to modify the main Entities, if they occur in the same Section.
|
||||
* A typical example would be to ignore Entities of type "CBI_author", if they occur in the same Section as "published_information" entities.
|
||||
*/
|
||||
String type
|
||||
/**
|
||||
* The EntityType can be one of four different values: ENTITY, RECOMMENDATION, FALSE_POSITIVE, FALSE_RECOMMENDATION.
|
||||
* If an ENTITY is overlapped by a FALSE_POSITIVE, the ENTITY is removed. If a RECOMMENDATION is overlapped by either an ENTITY or FALSE_RECOMMENDATION, it is removed.
|
||||
*/
|
||||
EntityType entityType
|
||||
/**
|
||||
* The text the Entity represents.
|
||||
*/
|
||||
String value
|
||||
/**
|
||||
* Up to three words after the Entity in the text.
|
||||
*/
|
||||
String textAfter
|
||||
/**
|
||||
* Up to three words before the Entity in the text.
|
||||
*/
|
||||
String textBefore
|
||||
/**
|
||||
* All pages whose TextBlock intersects the boundary of this entity. Is always equal to the Pages which have this RedactionEntity in their EntitySet.
|
||||
*/
|
||||
Set<Page> pages
|
||||
/**
|
||||
* All SemanticNodes whose TextBlock intersects the boundary of this entity. Is always equal to the SemanticNodes which have this RedactionEntity in their EntitySet.
|
||||
*/
|
||||
List<SemanticNode> intersectingNodes
|
||||
/**
|
||||
* The SemanticNode which is the deepest in the Tree structure and whose TextBlock fully contains the boundary of this Entity.
|
||||
*/
|
||||
SemanticNode deepestFullyContainingNode
|
||||
|
||||
The RedactionEntity also has the following methods:
|
||||
/**
|
||||
* Sets the Entity to applied, this is the default.
|
||||
* @param ruleIdentifier Should always be equal to the ruleIdentifier in the Rule name
|
||||
* @param reason Should describe the intention of the rule in a few words
|
||||
* @param legalBasis Is dependent on the rule, if none is known the default is "n-a", can't be null
|
||||
*/
|
||||
void apply(String ruleIdentifier, String reason, String legalBasis)
|
||||
/**
|
||||
* Same as apply, but legalBasis can be null.
|
||||
*/
|
||||
void force(String ruleIdentifier, String reason, String legalBasis)
|
||||
/**
|
||||
* Sets the Entity to not applied.
|
||||
* @param ruleIdentifier Should always be equal to the ruleIdentifier in the Rule name
|
||||
* @param reason Should describe the intention of the rule in a few words
|
||||
* @param legalBasis Is dependent on the rule, if none is known the default is "n-a", can't be null
|
||||
*/
|
||||
void skip(String ruleIdentifier, String reason, String legalBasis)
|
||||
/**
|
||||
* Sets the Entity to ignored. In most cases this is preferred over remove.
|
||||
* @param ruleIdentifier Should always be equal to the ruleIdentifier in the Rule name
|
||||
* @param reason Should describe the intention of the rule in a few words
|
||||
* @param legalBasis Is dependent on the rule, if none is known the default is "n-a", can't be null
|
||||
*/
|
||||
void ignore(String ruleIdentifier, String reason, String legalBasis)
|
||||
/**
|
||||
* Removes the entity entirely, also removes it from all EntitySets and sets intersectingNodes and deepestFullyContainingNode to null.
|
||||
* Should only be used in a few cases!
|
||||
* @param ruleIdentifier Should always be equal to the ruleIdentifier in the Rule name
|
||||
* @param reason Should describe the intention of the rule in a few words
|
||||
* @param legalBasis Is dependent on the rule, if none is known the default is "n-a", can't be null
|
||||
*/
|
||||
void remove(String ruleIdentifier, String reason, String legalBasis)
|
||||
----------------------------------------------------------------
|
||||
The EntityCreationService offers the following functions:
|
||||
/**
|
||||
* Searches the provided SemanticNode for the keyword and creates an Entity for each occurrence.
|
||||
* @param keyword the string to search for
|
||||
* @param type The type of the RedactionEntity to be created
|
||||
* @param entityType The EntityType of the RedactionEntity to be created
|
||||
* @param node The SemanticNode to search in
|
||||
* @return A Stream of RedactionEntities with the keyword as value, the type as type and the provided EntityType
|
||||
*/
|
||||
public Stream<RedactionEntity> byString(String keyword, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Same as byString, but case insensitive.
|
||||
*/
|
||||
public Stream<RedactionEntity> byStringIgnoreCase(String keyword, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Searches the provided SemanticNode with the regexPattern and creates a new RedactionEntity with the provided group for each occurrence.
|
||||
* @param regexPattern The regexPattern
|
||||
* @param type The type of the RedactionEntity to be created
|
||||
* @param entityType The EntityType of the RedactionEntity to be created
|
||||
* @param group the regexPattern group, that should be the entity
|
||||
* @param node The SemanticNode to search in
|
||||
* @return A Stream of RedactionEntities with the matched group as value, the type as type and the provided EntityType
|
||||
*/
|
||||
public Stream<RedactionEntity> byRegex(String regexPattern, String type, EntityType entityType, int group, SemanticNode node)
|
||||
/**
|
||||
* Same as byRegex, but case insensitive.
|
||||
*/
|
||||
public Stream<RedactionEntity> byRegexIgnoreCase(String regexPattern, String type, EntityType entityType, int group, SemanticNode node)
|
||||
/**
|
||||
* Same as byRegex, but can handle patterns with linebreaks.
|
||||
*/
|
||||
public Stream<RedactionEntity> byRegexWithLineBreaks(String regexPattern, String type, EntityType entityType, int group, SemanticNode node)
|
||||
/**
|
||||
* Same as byRegexWithLineBreaks, but case insensitive.
|
||||
*/
|
||||
public Stream<RedactionEntity> byRegexWithLineBreaksIgnoreCase(String regexPattern, String type, EntityType entityType, int group, SemanticNode node)
|
||||
/**
|
||||
* Finds the provided string, and creates a new RedactionEntity from the text after until the end of the line it is found in.
|
||||
* @param string The keyword to search for
|
||||
* @param type The type of the RedactionEntity to be created
|
||||
* @param entityType The EntityType of the RedactionEntity to be created
|
||||
* @param node The SemanticNode to search in
|
||||
* @return A Stream of RedactionEntities with the text after the keyword until the end of the line as value, the type as type and the provided EntityType
|
||||
*/
|
||||
public Stream<RedactionEntity> lineAfterString(String string, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Same as lineAfterString, but with multiple keywords
|
||||
*/
|
||||
public Stream<RedactionEntity> lineAfterStrings(List<String> strings, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Finds the provided string in a TableCell, and creates a new RedactionEntity in the same line but adjacent table cells to the right.
|
||||
* @param string The keyword to search for
|
||||
* @param type The type of the RedactionEntity to be created
|
||||
* @param entityType The EntityType of the RedactionEntity to be created
|
||||
* @param table The TableNode to search in
|
||||
* @return A Stream of RedactionEntities created from the adjacent table cells to the right of the keyword, with the type as type and the provided EntityType
|
||||
*/
|
||||
public Stream<RedactionEntity> lineAfterStringAcrossColumns(String string, String type, EntityType entityType, TableNode table)
|
||||
/**
|
||||
* Creates a redaction entity based on the given boundary, type, entity type, and semantic node.
|
||||
*
|
||||
* @param boundary The boundary of the redaction entity.
|
||||
* @param type The type of the redaction entity.
|
||||
* @param entityType The entity type of the redaction entity.
|
||||
* @param node The semantic node where the boundary is.
|
||||
* @return An Optional containing the new redaction entity.
|
||||
*/
|
||||
public Optional<RedactionEntity> byBoundary(Boundary boundary, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Creates new RedactionEntities between the provided start and stop boundaries. The start and stop boundaries are excluded.
|
||||
* If any boundaries of the new RedactionEntities overlap, only the shortest boundary will be used.
|
||||
* @param startBoundaries List of start boundaries
|
||||
* @param stopBoundaries List of stop boundaries
|
||||
* @param type The type of the redaction entity.
|
||||
* @param entityType The entity type of the redaction entity.
|
||||
* @param node The semantic node where the boundaries are.
|
||||
* @return A Stream of new RedactionEntities between the start and stop boundaries
|
||||
*/
|
||||
public Stream<RedactionEntity> betweenBoundaries(List<Boundary> startBoundaries, List<Boundary> stopBoundaries, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Same as betweenBoundaries, but it creates the start and stop boundaries by performing a text search on the provided SemanticNode.
|
||||
*/
|
||||
public Stream<RedactionEntity> betweenStrings(String start, String stop, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Same as betweenStrings, but case insensitive.
|
||||
*/
|
||||
public Stream<RedactionEntity> betweenStringsIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* These 6 functions work the same as betweenStrings, but they also include the start and/or stop strings or are case insensitive, depending on their name.
|
||||
*/
|
||||
public Stream<RedactionEntity> betweenStringsIncludeStart(String start, String stop, String type, EntityType entityType, SemanticNode node)
|
||||
public Stream<RedactionEntity> betweenStringsIncludeStartIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node)
|
||||
public Stream<RedactionEntity> betweenStringsIncludeEnd(String start, String stop, String type, EntityType entityType, SemanticNode node)
|
||||
public Stream<RedactionEntity> betweenStringsIncludeEndIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node)
|
||||
public Stream<RedactionEntity> betweenStringsIncludeStartAndEnd(String start, String stop, String type, EntityType entityType, SemanticNode node)
|
||||
public Stream<RedactionEntity> betweenStringsIncludeStartAndEndIgnoreCase(String start, String stop, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Same as betweenBoundaries, but it creates the start and stop boundaries by performing a regex search on the provided SemanticNode.
|
||||
*/
|
||||
public Stream<RedactionEntity> betweenRegexes(String regexStart, String regexStop, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Same as betweenRegexes, but case insensitive.
|
||||
*/
|
||||
public Stream<RedactionEntity> betweenRegexesIgnoreCase(String regexStart, String regexStop, String type, EntityType entityType, SemanticNode node)
|
||||
/**
|
||||
* Creates a new RedactionEntity which has the same boundary as the provided SemanticNode.
|
||||
* @param node The SemanticNode to create a new RedactionEntity from.
|
||||
* @param type The type of the redaction entity.
|
||||
* @param entityType The entity type of the redaction entity.
|
||||
* @return An optional RedactionEntity. Is empty, if the provided SemanticNode is empty.
|
||||
*/
|
||||
public Optional<RedactionEntity> bySemanticNode(SemanticNode node, String type, EntityType entityType)
|
||||
/**
|
||||
* Same as bySemanticNode, but ignores the SemanticNode if it's not a Paragraph, and likewise ignores all of its child SemanticNodes that are not Paragraphs.
|
||||
*/
|
||||
public Stream<RedactionEntity> bySemanticNodeParagraphsOnly(SemanticNode node, String type, EntityType entityType)
|
||||
/**
|
||||
* Searches the provided SemanticNode for the provided string, and creates a new RedactionEntity, from the end of the first occurrence of the string until the end of the SemanticNode.
|
||||
* @param string The string to search for
|
||||
* @param type The type of the redaction entity.
|
||||
* @param entityType The entity type of the redaction entity.
|
||||
* @param node The SemanticNode to use and search in
|
||||
* @return An optional RedactionEntity, is empty, if the SemanticNode is empty, or the string isn't found in the SemanticNode.
|
||||
*/
|
||||
public Optional<RedactionEntity> semanticNodeAfterString(String string, String type, EntityType entityType, SemanticNode node)
|
||||
----------------------------------------------------------------
|
||||
Rules may be grouped into two categories.
|
||||
The first category changes existing RedactionEntities, and the second creates new RedactionEntities.
|
||||
|
||||
There are two different types of rules: in one you create new Entities, and in the other you change or remove existing Entities.
|
||||
An Entity is any piece of text, uniquely identified in the Document by its Boundary, its Type and its EntityType. The Boundary consists of a start and stop index in the text of the document.
|
||||
The Type is a String like "PII", which stands for personally identifiable information.
|
||||
The goal is to find entities that fulfill certain conditions. Each SemanticNode has its own set of entities, but these sets may have intersections.
|
||||
For example, a Section contains all the entities in any of its children. Additionally, if an entity overlaps two SemanticNodes, both nodes have this entity in their sets.
|
||||
|
||||
To generate Drools rules for the scenario of changing or updating Entities, consider the following information:
|
||||
|
||||
1. Conditions: Specify the conditions that must be met for an entity to be selected. For example:
|
||||
- The entity has a specific attribute value.
|
||||
- The entity is within a certain range of values.
|
||||
- The entity satisfies a complex combination of conditions.
|
||||
|
||||
2. Actions: Define the actions to be performed when an entity fulfills the conditions. This could include:
|
||||
- Adding the entity to a result set.
|
||||
- Modifying the entity's attributes.
|
||||
- Triggering some other behavior or logic.
|
||||
|
||||
3. Rule Structure: Determine the structure of the Drools rules. This typically consists of:
|
||||
- Rule names: Choose meaningful names for your rules.
|
||||
- Rule attributes: Set the salience (priority) of rules if necessary.
|
||||
- Conditions: Define the conditions based on the requirements.
|
||||
- Actions: Specify the actions to be performed when the conditions are met.
|
||||
|
||||
Remember to provide specific examples, use case scenarios, and any additional requirements you have for the Drools rules.
|
||||
|
||||
Please provide any specific conditions, actions, or examples that you would like to be incorporated into the Drools rules.
|
||||
@ -42,6 +42,11 @@ public class Section implements GenericSemanticNode {
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Determines whether this Section has any tables.
|
||||
*
|
||||
* @return {@code true} if there are tables, {@code false} otherwise
|
||||
*/
|
||||
public boolean hasTables() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.TABLE).findAny().isPresent();
|
||||
|
||||
@ -267,7 +267,7 @@ public class Table implements SemanticNode {
|
||||
* @param values List of strings to check cells with provided header
|
||||
* @return true, if this table has a column with the provided header and any of the table cells in that column contains any of the provided values.
|
||||
*/
|
||||
public boolean hasRowWithHeaderAndAnyValue(String header, List<String> values) {
|
||||
public boolean hasRowWithHeaderAndAnyValue(String header, String... values) {
|
||||
|
||||
return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsAnyString(values));
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user