proompting

This commit is contained in:
Kilian Schuettler 2023-07-10 13:47:25 +02:00
parent 12fcc6ca6d
commit b2621fb4b3
7 changed files with 693 additions and 1 deletions

18
drools-prompt/Page_doc Normal file
View File

@ -0,0 +1,18 @@
/**
* Retrieves the main body text block.
*
* @return The text block representing the main body of the document.
*/
public TextBlock getMainBodyTextBlock()
/**
* Gets all Entities located on the page
*
* @return Set of all Entities associated with this Page
*/
Set<RedactionEntity> getEntities();
/**
* Returns the Page Number
*
* @return The number of this page
*/
Integer getPageNumber();

View File

@ -0,0 +1,6 @@
/**
* Determines whether this Section has any tables.
*
* @return {@code true} if there are tables, {@code false} otherwise
*/
public boolean hasTables()

View File

@ -0,0 +1,206 @@
/**
* Returns the type of this node, such as NodeType.SECTION, NodeType.PARAGRAPH, etc.
*
* @return NodeType of this node
*/
NodeType getType();
/**
* Any Node maintains its own Set of Entities.
* This Set contains all Entities whose boundary intersects the boundary of this node.
* The Entities might overlap with the Entities in other Sets
*
* @return Set of all Entities associated with this Node
*/
Set<RedactionEntity> getEntities();
/**
* Returns all Pages this SemanticNode is associated with.
*
* @return Set of Pages this node appears on.
*/
Set<Page> getPages()
/**
* Checks if this node appears on the specified page number.
*
* @param pageNumber The page number to check.
* @return True if this node is found on the specified page number, false otherwise.
*/
boolean isOnPage(int pageNumber)
/**
* Returns the closest Headline associated with this SemanticNode
*
* @return First Headline found.
*/
Headline getHeadline()
/**
* @return The SemanticNode representing the Parent in the DocumentTree
* @throws NotFoundException when no parent is present
*/
SemanticNode getParent()
/**
* Checks whether this SemanticNode has any Entity of the provided type.
* Ignores Entity with ignored == true or removed == true.
*
* @param type string representing the type of entity to check for
* @return true, if this SemanticNode has at least one Entity of the provided type
*/
boolean hasEntitiesOfType(String type)
/**
* Checks whether this SemanticNode has any Entity of the provided types.
* Ignores Entity with ignored == true or removed == true.
*
* @param types an array of strings representing the types of entities to check for
* @return true, if this SemanticNode has at least one Entity of any of the provided types
*/
boolean hasEntitiesOfAnyType(String... types)
/**
* Checks whether this SemanticNode has at least one Entity of each of the provided types.
* Ignores Entity with ignored == true or removed == true.
*
* @param types an array of strings representing the types of entities to check for
* @return true, if this SemanticNode has at least one Entity of each of the provided types
*/
boolean hasEntitiesOfAllTypes(String... types)
/**
* Returns a List of Entities in this SemanticNode which are of the provided type such as "CBI_author".
* Ignores Entity with ignored == true or removed == true.
*
* @param type string representing the type of entities to return
* @return List of RedactionEntities of the provided type
*/
List<RedactionEntity> getEntitiesOfType(String type)
/**
* Returns a List of Entities in this SemanticNode which have any of the provided types such as "CBI_author".
* Ignores Entity with ignored == true or removed == true.
*
* @param types A list of strings representing the types of entities to return
* @return List of RedactionEntities of any provided type
*/
List<RedactionEntity> getEntitiesOfType(List<String> types)
/**
* Returns a List of Entities in this SemanticNode which have any of the provided types.
* Ignores Entity with the ignored flag set to true or the removed flag set to true.
*
* @param types A list of strings representing the types of entities to return
* @return List of RedactionEntities that match any of the provided types
*/
List<RedactionEntity> getEntitiesOfType(String... types)
/**
* Checks whether this SemanticNode contains the provided String.
*
* @param string A String which the TextBlock might contain
* @return true, if this node's TextBlock contains the string
*/
boolean containsString(String string)
/**
* Checks whether this SemanticNode contains all the provided Strings.
*
* @param strings A List of Strings which the TextBlock might contain
* @return true, if this node's TextBlock contains all strings
*/
boolean containsAllStrings(String... strings)
/**
* Checks whether this SemanticNode contains any of the provided Strings.
*
* @param strings A List of Strings to check if they are contained in the TextBlock
* @return true, if this node's TextBlock contains any of the provided strings
*/
boolean containsAnyString(String... strings)
/**
* Checks whether this SemanticNode contains the provided String ignoring case.
*
* @param string A String which the TextBlock might contain
* @return true, if this node's TextBlock contains the string ignoring case
*/
boolean containsStringIgnoreCase(String string)
/**
* Checks whether this SemanticNode contains any of the provided Strings ignoring case.
*
* @param strings A List of Strings which the TextBlock might contain
* @return true, if this node's TextBlock contains any of the strings
*/
boolean containsAnyStringIgnoreCase(String... strings)
/**
* Checks whether this SemanticNode contains all of the provided Strings ignoring case.
*
* @param strings A List of Strings which the TextBlock might contain
* @return true, if this node's TextBlock contains all of the strings ignoring case
*/
boolean containsAllStringsIgnoreCase(String... strings)
/**
* Checks whether this SemanticNode matches the provided regex pattern.
*
* @param regexPattern A String representing a regex pattern, which the TextBlock might contain
* @return true, if this node's TextBlock contains the regex pattern
*/
boolean matchesRegex(String regexPattern)
/**
* Checks whether this SemanticNode matches the provided regex pattern ignoring case.
*
* @param regexPattern A String representing a regex pattern, which the TextBlock might contain
* @return true, if this node's TextBlock contains the regex pattern ignoring case
*/
boolean matchesRegexIgnoreCase(String regexPattern)
/**
* Streams all children located directly underneath this node in the DocumentTree.
*
* @return Stream of all children
*/
Stream<SemanticNode> streamChildren()
/**
* Streams all children located directly underneath this node in the DocumentTree of the provided type.
*
* @param nodeType the type of nodes to stream
* @return Stream of all children of the provided type
*/
Stream<SemanticNode> streamChildrenOfType(NodeType nodeType)
/**
* Recursively streams all SemanticNodes located underneath this node in the DocumentTree in order.
*
* @return Stream of all SubNodes
*/
Stream<SemanticNode> streamAllSubNodes()
/**
* Recursively streams all SemanticNodes of a specified type located underneath this node in the DocumentTree in order.
*
* @param nodeType the type of nodes to be streamed
* @return a Stream of all SubNodes of the specified type
*/
Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType)
/**
* The Boundary is the start and end string offsets in the reading order of the document.
*
* @return Boundary of this Node's TextBlock
*/
Boundary getBoundary()
/**
* The SectionIdentifier uses the numeric identifiers of Headlines to infer a tree structure.
* It implements functions such as sectionIdentifier.isChildOf(otherSectionIdentifier) and sectionIdentifier.isParentOf(otherSectionIdentifier)
*
* @return The SectionIdentifier from the first Headline.
*/
SectionIdentifier getSectionIdentifier()

86
drools-prompt/Table_doc Normal file
View File

@ -0,0 +1,86 @@
/**
* Streams all entities in this table, that appear in a row, which contains any of the provided strings.
*
* @param strings Strings to check whether a row contains them
* @return Stream of all entities in this table, that appear in a row, which contains any of the provided strings
*/
Stream<RedactionEntity> streamEntitiesWhereRowContainsStringsIgnoreCase(List<String> strings)
/**
* Checks whether the specified row contains all the provided strings.
*
* @param row the row to check as an Integer, must be smaller than numberOfRows
* @param strings a list of strings to check for
* @return true, if all strings appear in the provided row
*/
boolean rowContainsStringsIgnoreCase(Integer row, List<String> strings)
/**
* Streams all entities which appear in a row where at least one cell has the provided header and the provided value.
*
* @param header the header value to search for
* @param value the string which the table cell should contain
* @return a stream of all entities, which appear in a row where at least one cell has the provided header and the provided value.
*/
Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndValue(String header, String value)
/**
* Streams all entities which appear in a row where at least one cell has the provided header and any provided value.
*
* @param header the header value to search for
* @param values the strings which the table cell should contain
* @return a stream of all entities, which appear in a row where at least one cell has the provided header and any provided value.
*/
Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List<String> values)
/**
* Streams all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
* Ignores Entity with ignored == true or removed == true.
*
* @param types type strings to check whether a row contains an entity like them
* @return Stream of all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
*/
Stream<RedactionEntity> streamEntitiesWhereRowContainsEntitiesOfType(List<String> types)
/**
* Streams all entities in this table, that appear in a row, which contains no entity of any of the provided types.
* Ignores Entity with ignored == true or removed == true.
*
* @param types type strings to check whether a row contains an entity like them
* @return Stream of all entities in this table, that appear in a row, which contains no entity of any of the provided types.
*/
Stream<RedactionEntity> streamEntitiesWhereRowContainsNoEntitiesOfType(List<String> types)
/**
* Streams all TableCells in this Table which have the provided header row-wise.
*
* @param header the header which the streamed TableCells must have
* @return Stream of all TableCells which have the provided header
*/
Stream<TableCell> streamTableCellsWithHeader(String header)
/**
* Streams all Headers and checks if any equal the provided string.
*
* @param header string to check the headers for
* @return true, if at least one header equals the provided string
*/
boolean hasHeader(String header)
/**
* Checks if this table has a column with the provided header and any of the table cells in that column contain the provided value.
*
* @param header string to find header cells
* @param value string to check cells with provided header
* @return true, if this table has a column with the provided header and any of the table cells in that column contain the provided value
*/
boolean hasRowWithHeaderAndValue(String header, String value)
/**
* Finds all entities of the provided type, which appear in the same row that the provided entity appears in.
* Ignores Entity with ignored == true or removed == true.
*
* @param type the type of entities to search for
* @param redactionEntity the entity, which appears in the row to search
* @return List of all entities of the provided type, which appear in the same row that the provided entity appears in.
*/
List<RedactionEntity> getEntitiesOfTypeInSameRow(String type, RedactionEntity redactionEntity)

371
drools-prompt/drools-prompt Normal file
View File

@ -0,0 +1,371 @@
From now on, you are a Drools rule generator.
You have a Document data structure written in Java with the following objects:
- Section
- Table
- TableCell
- Paragraph
- Headline
- Page
- RedactionEntity
- EntityCreationService
The Section, Table, TableCell, Paragraph, and Headline implement a common interface called SemanticNode. SemanticNodes are arranged in a tree-like fashion, where any SemanticNode can have multiple SemanticNodes as children. The arrangement is as follows:
- Tables only have TableCells as children.
- TableCells may have any child, except TableCells.
- Paragraphs and Headlines have no children.
- Sections may have any child except TableCells, but if a Section contains Paragraphs as well as Tables, it is split into a Section with multiple Sections as children, where each child Section contains either only Tables or only Paragraphs.
The first Headline remains in the Parent Section, while all others are put into the child section they belong to.
----------------------------------------------------------------
The relevant functions for SemanticNode:
/**
* Returns the type of this node, such as NodeType.SECTION, NodeType.PARAGRAPH, etc.
*
* @return NodeType of this node
*/
NodeType getType();
/**
* Any Node maintains its own Set of Entities.
* This Set contains all Entities whose boundary intersects the boundary of this node.
* The Entities might overlap with the Entities in other Sets
*
* @return Set of all Entities associated with this Node
*/
Set<RedactionEntity> getEntities();
/**
* Returns all Pages this SemanticNode is associated with.
*
* @return Set of Pages this node appears on.
*/
Set<Page> getPages()
/**
* Checks if this node appears on the specified page number.
*
* @param pageNumber The page number to check.
* @return True if this node is found on the specified page number, false otherwise.
*/
boolean isOnPage(int pageNumber)
/**
* Returns the closest Headline associated with this SemanticNode
*
* @return First Headline found.
*/
Headline getHeadline()
/**
* @return The SemanticNode representing the Parent in the DocumentTree
* @throws NotFoundException when no parent is present
*/
SemanticNode getParent()
/**
* Checks whether this SemanticNode has any Entity of the provided type.
* Ignores Entity with ignored == true or removed == true.
*
* @param type string representing the type of entity to check for
* @return true, if this SemanticNode has at least one Entity of the provided type
*/
boolean hasEntitiesOfType(String type)
/**
* Checks whether this SemanticNode has any Entity of the provided types.
* Ignores Entity with ignored == true or removed == true.
*
* @param types an array of strings representing the types of entities to check for
* @return true, if this SemanticNode has at least one Entity of any of the provided types
*/
boolean hasEntitiesOfAnyType(String... types)
/**
* Checks whether this SemanticNode has at least one Entity of each of the provided types.
* Ignores Entity with ignored == true or removed == true.
*
* @param types an array of strings representing the types of entities to check for
* @return true, if this SemanticNode has at least one Entity of each of the provided types
*/
boolean hasEntitiesOfAllTypes(String... types)
/**
* Returns a List of Entities in this SemanticNode which are of the provided type such as "CBI_author".
* Ignores Entity with ignored == true or removed == true.
*
* @param type string representing the type of entities to return
* @return List of RedactionEntities of the provided type
*/
List<RedactionEntity> getEntitiesOfType(String type)
/**
* Returns a List of Entities in this SemanticNode which have any of the provided types such as "CBI_author".
* Ignores Entity with ignored == true or removed == true.
*
* @param types A list of strings representing the types of entities to return
* @return List of RedactionEntities of any provided type
*/
List<RedactionEntity> getEntitiesOfType(List<String> types)
/**
* Returns a List of Entities in this SemanticNode which have any of the provided types.
* Ignores Entity with the ignored flag set to true or the removed flag set to true.
*
* @param types A list of strings representing the types of entities to return
* @return List of RedactionEntities that match any of the provided types
*/
List<RedactionEntity> getEntitiesOfType(String... types)
/**
* Checks whether this SemanticNode contains the provided String.
*
* @param string A String which the TextBlock might contain
* @return true, if this node's TextBlock contains the string
*/
boolean containsString(String string)
/**
* Checks whether this SemanticNode contains all the provided Strings.
*
* @param strings A List of Strings which the TextBlock might contain
* @return true, if this node's TextBlock contains all strings
*/
boolean containsAllStrings(String... strings)
/**
* Checks whether this SemanticNode contains any of the provided Strings.
*
* @param strings A List of Strings to check if they are contained in the TextBlock
* @return true, if this node's TextBlock contains any of the provided strings
*/
boolean containsAnyString(String... strings)
/**
* Checks whether this SemanticNode contains the provided String ignoring case.
*
* @param string A String which the TextBlock might contain
* @return true, if this node's TextBlock contains the string ignoring case
*/
boolean containsStringIgnoreCase(String string)
/**
* Checks whether this SemanticNode contains any of the provided Strings ignoring case.
*
* @param strings A List of Strings which the TextBlock might contain
* @return true, if this node's TextBlock contains any of the strings
*/
boolean containsAnyStringIgnoreCase(String... strings)
/**
* Checks whether this SemanticNode contains all of the provided Strings ignoring case.
*
* @param strings A List of Strings which the TextBlock might contain
* @return true, if this node's TextBlock contains all of the strings ignoring case
*/
boolean containsAllStringsIgnoreCase(String... strings)
/**
* Checks whether this SemanticNode matches the provided regex pattern.
*
* @param regexPattern A String representing a regex pattern, which the TextBlock might contain
* @return true, if this node's TextBlock contains the regex pattern
*/
boolean matchesRegex(String regexPattern)
/**
* Checks whether this SemanticNode matches the provided regex pattern ignoring case.
*
* @param regexPattern A String representing a regex pattern, which the TextBlock might contain
* @return true, if this node's TextBlock contains the regex pattern ignoring case
*/
boolean matchesRegexIgnoreCase(String regexPattern)
/**
* Streams all children located directly underneath this node in the DocumentTree.
*
* @return Stream of all children
*/
Stream<SemanticNode> streamChildren()
/**
* Streams all children located directly underneath this node in the DocumentTree of the provided type.
*
* @param nodeType the type of nodes to stream
* @return Stream of all children of the provided type
*/
Stream<SemanticNode> streamChildrenOfType(NodeType nodeType)
/**
* Recursively streams all SemanticNodes located underneath this node in the DocumentTree in order.
*
* @return Stream of all SubNodes
*/
Stream<SemanticNode> streamAllSubNodes()
/**
* Recursively streams all SemanticNodes of a specified type located underneath this node in the DocumentTree in order.
*
* @param nodeType the type of nodes to be streamed
* @return a Stream of all SubNodes of the specified type
*/
Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType)
/**
* The Boundary is the start and end string offsets in the reading order of the document.
*
* @return Boundary of this Node's TextBlock
*/
Boundary getBoundary()
/**
* The SectionIdentifier uses the numeric identifiers of Headlines to infer a tree structure.
* It implements functions such as sectionIdentifier.isChildOf(otherSectionIdentifier) and sectionIdentifier.isParentOf(otherSectionIdentifier)
*
* @return The SectionIdentifier from the first Headline.
*/
SectionIdentifier getSectionIdentifier()
----------------------------------------------------------------
The Table has the following additional functions:
/**
* Streams all entities in this table, that appear in a row, which contains any of the provided strings.
*
* @param strings Strings to check whether a row contains them
* @return Stream of all entities in this table, that appear in a row, which contains any of the provided strings
*/
Stream<RedactionEntity> streamEntitiesWhereRowContainsStringsIgnoreCase(List<String> strings)
/**
* Checks whether the specified row contains all the provided strings.
*
* @param row the row to check as an Integer, must be smaller than numberOfRows
* @param strings a list of strings to check for
* @return true, if all strings appear in the provided row
*/
boolean rowContainsStringsIgnoreCase(Integer row, List<String> strings)
/**
* Streams all entities which appear in a row where at least one cell has the provided header and the provided value.
*
* @param header the header value to search for
* @param value the string which the table cell should contain
* @return a stream of all entities, which appear in a row where at least one cell has the provided header and the provided value.
*/
Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndValue(String header, String value)
/**
* Streams all entities which appear in a row where at least one cell has the provided header and any provided value.
*
* @param header the header value to search for
* @param values the strings which the table cell should contain
* @return a stream of all entities, which appear in a row where at least one cell has the provided header and any provided value.
*/
Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List<String> values)
/**
* Streams all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
* Ignores Entity with ignored == true or removed == true.
*
* @param types type strings to check whether a row contains an entity like them
* @return Stream of all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
*/
Stream<RedactionEntity> streamEntitiesWhereRowContainsEntitiesOfType(List<String> types)
/**
* Streams all entities in this table, that appear in a row, which contains no entity of any of the provided types.
* Ignores Entity with ignored == true or removed == true.
*
* @param types type strings to check whether a row contains an entity like them
* @return Stream of all entities in this table, that appear in a row, which contains no entity of any of the provided types.
*/
Stream<RedactionEntity> streamEntitiesWhereRowContainsNoEntitiesOfType(List<String> types)
/**
* Streams all TableCells in this Table which have the provided header row-wise.
*
* @param header the header which the streamed TableCells must have
* @return Stream of all TableCells which have the provided header
*/
Stream<TableCell> streamTableCellsWithHeader(String header)
/**
* Streams all Headers and checks if any equal the provided string.
*
* @param header string to check the headers for
* @return true, if at least one header equals the provided string
*/
boolean hasHeader(String header)
/**
* Checks if this table has a column with the provided header and any of the table cells in that column contain the provided value.
*
* @param header string to find header cells
* @param value string to check cells with provided header
* @return true, if this table has a column with the provided header and any of the table cells in that column contain the provided value
*/
boolean hasRowWithHeaderAndValue(String header, String value)
/**
* Finds all entities of the provided type, which appear in the same row that the provided entity appears in.
* Ignores Entity with ignored == true or removed == true.
*
* @param type the type of entities to search for
* @param redactionEntity the entity, which appears in the row to search
* @return List of all entities of the provided type, which appear in the same row that the provided entity appears in.
*/
List<RedactionEntity> getEntitiesOfTypeInSameRow(String type, RedactionEntity redactionEntity)
----------------------------------------------------------------
The Section has these additional functions:
/**
* Determines whether this Section has any tables.
*
* @return {@code true} if there are tables, {@code false} otherwise
*/
boolean hasTables()
----------------------------------------------------------------
The Page Object has the following functions:
/**
* Retrieves the main body text block.
* @return The text block representing the main body of the document.
*/
public TextBlock getMainBodyTextBlock()
/**
* Gets all Entities located on the page
* @return Set of all Entities associated with this Page
*/
Set<RedactionEntity> getEntities();
/**
* Returns the Page Number
*
* @return The number of this page
*/
Integer getPageNumber();
----------------------------------------------------------------
The goal of the Rules is to find pieces of Text that we want to redact.
There are two different types of rules: one type creates new Entities, and the other changes or removes existing Entities.
An Entity is any piece of text, uniquely identified in the Document by its Boundary, its Type and its EntityType. The Boundary consists of a start and stop index in the text of the document.
The Type is a String like "PII", which stands for Personally Identifiable Information.
The goal is to find entities that fulfill certain conditions. Each SemanticNode has its own set of entities, but these sets may have intersections.
For example, a Section contains all the entities in any of its children. Additionally, if an entity overlaps two SemanticNodes, both nodes have this entity in their sets.
To generate Drools rules for the scenario of changing or updating Entities, consider the following information:
1. Conditions: Specify the conditions that must be met for an entity to be selected. For example:
- The entity has a specific attribute value.
- The entity is within a certain range of values.
- The entity satisfies a complex combination of conditions.
2. Actions: Define the actions to be performed when an entity fulfills the conditions. This could include:
- Adding the entity to a result set.
- Modifying the entity's attributes.
- Triggering some other behavior or logic.
3. Rule Structure: Determine the structure of the Drools rules. This typically consists of:
- Rule names: Choose meaningful names for your rules.
- Rule attributes: Set the salience (priority) of rules if necessary.
- Conditions: Define the conditions based on the requirements.
- Actions: Specify the actions to be performed when the conditions are met.
Remember to provide specific examples, use case scenarios, and any additional requirements you have for the Drools rules.
Please provide any specific conditions, actions, or examples that you would like to be incorporated into the Drools rules.

View File

@ -42,6 +42,11 @@ public class Section implements GenericSemanticNode {
}
/**
* Determines whether this Section has any tables.
*
* @return {@code true} if there are tables, {@code false} otherwise
*/
public boolean hasTables() {
return streamAllSubNodesOfType(NodeType.TABLE).findAny().isPresent();

View File

@ -267,7 +267,7 @@ public class Table implements SemanticNode {
* @param values List of strings to check cells with provided header
* @return true, if this table has a column with the provided header and any of the table cells in that column contains any of the provided values.
*/
public boolean hasRowWithHeaderAndAnyValue(String header, List<String> values) {
public boolean hasRowWithHeaderAndAnyValue(String header, String... values) {
return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsAnyString(values));
}