hotfix: Tried parent child

This commit is contained in:
deiflaender 2023-04-11 16:04:19 +02:00
parent c8f1f1255e
commit 17aaf2e639
13 changed files with 141893 additions and 284 deletions

View File

@ -0,0 +1,18 @@
package com.iqser.red.service.search.v1.server.model;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Index payload for a child entry of the parent/child (join) index layout.
 *
 * <p>Wraps the content of one section together with the join field that links the
 * section to its parent document. The index mapping declares a {@code joinType} field
 * of type {@code join} with the relation {@code document -> section}.
 *
 * <p>Instances are created through the Lombok-generated builder; both generated
 * constructors are private.
 */
@Data
@Builder
@NoArgsConstructor(access = AccessLevel.PRIVATE)
@AllArgsConstructor(access = AccessLevel.PRIVATE)
@SuppressWarnings("serial")
public class ChildObject {
// Content of the section being indexed.
private IndexSection section;
// Join field value: carries the relation name and the id of the parent document.
private JoinField joinType;
}

View File

@ -11,6 +11,6 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor
public class Embeddings {
private Map<String, Float[]> embeddings;
private Map<String, Double[]> embeddings;
}

View File

@ -2,9 +2,7 @@ package com.iqser.red.service.search.v1.server.model;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Set;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -19,18 +17,10 @@ import lombok.NoArgsConstructor;
@SuppressWarnings("serial")
public class IndexDocument implements Serializable {
private String fileId;
private String filename;
private int sectionNumber;
private String text;
private Set<Integer> pages;
private String headline;
@Builder.Default
private List<IndexFileAttribute> fileAttributes = new ArrayList<>();
private Float[] wordEmbeddingsVector = new Float[383];
}

View File

@ -21,4 +21,6 @@ public class IndexSection implements Serializable {
private Set<Integer> pages;
private String headline;
private Double[] wordEmbeddingsVector = new Double[383];
}

View File

@ -0,0 +1,18 @@
package com.iqser.red.service.search.v1.server.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Value object for the {@code joinType} join field of a child entry.
 *
 * <p>{@code name} is the relation name of the entry and {@code parent} is the id of
 * the parent entry it belongs to.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class JoinField {
// Relation name; defaults to "section", the child side of the mapping's join relation.
@Builder.Default
private String name = "section";
// Id of the parent entry this child is attached to.
private String parent;
}

View File

@ -0,0 +1,21 @@
package com.iqser.red.service.search.v1.server.model;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Index payload for a parent entry of the parent/child (join) index layout.
 *
 * <p>Wraps the file-level {@link IndexDocument} together with the join field value
 * that marks the entry as the parent side of the relation.
 *
 * <p>Instances are created through the Lombok-generated builder; both generated
 * constructors are private.
 */
@Data
@Builder
@NoArgsConstructor(access = AccessLevel.PRIVATE)
@AllArgsConstructor(access = AccessLevel.PRIVATE)
@SuppressWarnings("serial")
public class ParentObject {
// File-level data of the document being indexed.
private IndexDocument document;
// Join field value; defaults to "document", the parent relation name in the mapping.
@Builder.Default
private String joinType = "document";
}

View File

@ -10,21 +10,29 @@ import org.springframework.stereotype.Service;
import com.iqser.red.service.search.v1.server.model.IndexDocument;
import com.iqser.red.service.search.v1.server.model.IndexDocumentUpdate;
import com.iqser.red.service.search.v1.server.model.IndexFileAttribute;
import com.iqser.red.service.search.v1.server.model.IndexSection;
@Service
public class IndexDocumentConverterService {
public IndexDocument convert(String fileId, String filename, int sectionNr, Set<Integer> pages, String headline, String text, Map<String, String> fileAttributes, Float[] embeddingsVector) {
public IndexDocument convert(String fileId, String filename, Map<String, String> fileAttributes) {
return IndexDocument.builder()
.fileId(fileId)
.filename(filename)
.sectionNumber(sectionNr)
.fileAttributes(convertFileAttributes(fileAttributes))
.build();
}
public IndexSection convert(int sectionNumber, String text, Set<Integer> pages, String headline, Double[] wordEmbeddingsVector) {
return IndexSection.builder()
.sectionNumber(sectionNumber)
.text(text)
.pages(pages)
.headline(headline)
.text(text)
.fileAttributes(convertFileAttributes(fileAttributes))
.wordEmbeddingsVector(embeddingsVector)
.wordEmbeddingsVector(wordEmbeddingsVector)
.build();
}

View File

@ -6,7 +6,11 @@ import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service;
import com.iqser.red.service.search.v1.server.exception.IndexException;
import com.iqser.red.service.search.v1.server.model.ChildObject;
import com.iqser.red.service.search.v1.server.model.IndexDocument;
import com.iqser.red.service.search.v1.server.model.IndexSection;
import com.iqser.red.service.search.v1.server.model.JoinField;
import com.iqser.red.service.search.v1.server.model.ParentObject;
import com.iqser.red.service.search.v1.server.multitenancy.TenantContext;
import com.iqser.red.service.search.v1.server.service.DocumentIndexService;
import com.iqser.red.service.search.v1.server.settings.ElasticsearchSettings;
@ -33,12 +37,26 @@ public class DocumentIndexServiceImpl implements DocumentIndexService {
try {
clientCache.getClient()
.index(i -> i.index(TenantContext.getTenantId())
.id(indexDocument.getFileId()+"_"+indexDocument.getSectionNumber())
.id(indexDocument.getFileId())
.refresh(Refresh._DESERIALIZER.parse(settings.getRefreshPolicy()))
.document(indexDocument));
.document(ParentObject.builder().document(indexDocument).build()));
} catch (IOException | ElasticsearchException e) {
throw IndexException.documentIndexError(indexDocument.getFileId(), e);
}
}
/**
 * Indexes a single section as a child entry of the document identified by
 * {@code parentFileId}.
 *
 * <p>The entry id is {@code parentFileId + "_" + sectionNumber}. The request is routed
 * by {@code parentFileId}; Elasticsearch join fields require a child to be stored on
 * the same shard as its parent.
 *
 * @param parentFileId id of the parent document entry; also used as routing key
 * @param indexSection section content to index
 */
public void indexSection(String parentFileId, IndexSection indexSection) {
    try {
        var client = clientCache.getClient();
        client.index(request -> {
            request.index(TenantContext.getTenantId());
            request.id(parentFileId + "_" + indexSection.getSectionNumber());
            request.routing(parentFileId);
            request.refresh(Refresh._DESERIALIZER.parse(settings.getRefreshPolicy()));

            // Link the child entry to its parent through the join field.
            JoinField parentLink = JoinField.builder().parent(parentFileId).build();
            ChildObject child = ChildObject.builder().section(indexSection).joinType(parentLink).build();
            return request.document(child);
        });
    } catch (IOException | ElasticsearchException e) {
        // Failures are reported against the parent file id.
        throw IndexException.documentIndexError(parentFileId, e);
    }
}
}

View File

@ -1,44 +1,58 @@
{
"properties": {
"fileId": {
"type": "keyword"
},
"filename": {
"type": "text",
"term_vector": "with_positions_offsets",
"analyzer": "filename_analyzer"
},
"sectionNumber": {
"type": "keyword"
},
"fileAttributes": {
"type": "nested",
"include_in_parent": true,
"document": {
"properties": {
"name": {
"fileId": {
"type": "keyword"
},
"value": {
"filename": {
"type": "text",
"term_vector": "with_positions_offsets"
"term_vector": "with_positions_offsets",
"analyzer": "filename_analyzer"
},
"fileAttributes": {
"type": "nested",
"include_in_parent": true,
"properties": {
"name": {
"type": "keyword"
},
"value": {
"type": "text",
"term_vector": "with_positions_offsets"
}
}
}
}
},
"headline": {
"type": "text"
"section": {
"properties": {
"sectionNumber": {
"type": "keyword"
},
"headline": {
"type": "text"
},
"pages": {
"type": "keyword"
},
"text": {
"type": "text",
"term_vector": "with_positions_offsets"
},
"wordEmbeddingsVector": {
"type": "dense_vector",
"dims": 384,
"index": true,
"similarity": "cosine"
}
}
},
"pages": {
"type": "keyword"
},
"text": {
"type": "text",
"term_vector": "with_positions_offsets"
},
"wordEmbeddingsVector": {
"type": "dense_vector",
"dims": 384,
"index": true,
"similarity": "cosine"
"joinType": {
"type": "join",
"relations": {
"document": "section"
}
}
}
}

View File

@ -19,11 +19,14 @@ import com.iqser.red.service.search.v1.server.client.FileStatusProcessingUpdateC
import com.iqser.red.service.search.v1.server.client.IndexInformationClient;
import com.iqser.red.service.search.v1.server.model.Embeddings;
import com.iqser.red.service.search.v1.server.model.IndexDocument;
import com.iqser.red.service.search.v1.server.model.IndexSection;
import com.iqser.red.service.search.v1.server.model.SectionText;
import com.iqser.red.service.search.v1.server.model.Text;
import com.iqser.red.service.search.v1.server.multitenancy.TenantContext;
import com.iqser.red.service.search.v1.server.service.elasticsearch.DocumentIndexServiceImpl;
import com.iqser.red.service.search.v1.server.service.elasticsearch.EsClientCache;
import co.elastic.clients.elasticsearch._types.KnnQuery;
import co.elastic.clients.elasticsearch._types.query_dsl.QueryBuilders;
import co.elastic.clients.elasticsearch.core.KnnSearchRequest;
import co.elastic.clients.elasticsearch.core.SearchRequest;
@ -39,7 +42,7 @@ public class ElasticsearchTest extends AbstractElasticsearchIntegrationTest {
private ObjectMapper objectMapper;
@Autowired
private DocumentIndexService documentIndexService;
private DocumentIndexServiceImpl documentIndexService;
@Autowired
@ -77,13 +80,283 @@ public class ElasticsearchTest extends AbstractElasticsearchIntegrationTest {
private final long UPDATE_TIMER = 1500;
// See https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html
// @Test
// @SneakyThrows
// public void testWordEmbeddingsScript() {
//
// ClassPathResource textResource = new ClassPathResource("files/1-TEXT.json");
// Text text1 = objectMapper.readValue(textResource.getInputStream(), Text.class);
//
// ClassPathResource embeddingsResource = new ClassPathResource("files/pdf-1.EMBEDDINGS.json");
// Embeddings embeddings1 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
//
// int size = text1.getSectionTexts().size();
// int i = 1;
// for (SectionText sectionText : text1.getSectionTexts()) {
// indexSection("1-TEXT",
// "1-TEXT.pdf",
// sectionText.getSectionNumber(),
// sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
// sectionText.getHeadline(),
// sectionText.getText(),
// null,
// embeddings1.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
// System.out.println("Index section " + i + "of" + size + "for document 1");
// i++;
// }
//
// textResource = new ClassPathResource("files/2-TEXT.json");
// var text2 = objectMapper.readValue(textResource.getInputStream(), Text.class);
//
// embeddingsResource = new ClassPathResource("files/pdf-2.EMBEDDINGS.json");
// var embeddings2 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
//
// size = text1.getSectionTexts().size();
// i = 1;
// for (SectionText sectionText : text2.getSectionTexts()) {
// indexSection("2-TEXT",
// "2-TEXT.pdf",
// sectionText.getSectionNumber(),
// sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
// sectionText.getHeadline(),
// sectionText.getText(),
// null,
// embeddings2.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
// System.out.println("Index section " + i + "of" + size + "for document 2");
// i++;
// }
//
// textResource = new ClassPathResource("files/3-TEXT.json");
// var text3 = objectMapper.readValue(textResource.getInputStream(), Text.class);
//
// embeddingsResource = new ClassPathResource("files/pdf-3.EMBEDDINGS.json");
// var embeddings3 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
//
// size = text1.getSectionTexts().size();
// i = 1;
// for (SectionText sectionText : text3.getSectionTexts()) {
// indexSection("3-TEXT",
// "3-TEXT.pdf",
// sectionText.getSectionNumber(),
// sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
// sectionText.getHeadline(),
// sectionText.getText(),
// null,
// embeddings3.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
// System.out.println("Index section " + i + "of" + size + "for document 3");
// i++;
// }
//
//
// Thread.sleep(240000);
// System.out.println("Finished wait");
//
// for (Map.Entry<String, Float[]> vector : embeddings1.getEmbeddings().entrySet()) {
// System.out.println("Document 1: section: " + vector.getKey());
// queryWithScript(vector.getValue());
//
// }
//
//
// for (Map.Entry<String, Float[]> vector : embeddings2.getEmbeddings().entrySet()) {
// System.out.println("Document 2: section: " + vector.getKey());
// queryWithScript(vector.getValue());
//
// }
//
//
// for (Map.Entry<String, Float[]> vector : embeddings3.getEmbeddings().entrySet()) {
// System.out.println("Document 3: section: " + vector.getKey());
// queryWithScript(vector.getValue());
//
// }
//
//
//
// }
//
//
// @SneakyThrows
// private void queryWithScript(Float[] embeddingsVector){
// final String vector = objectMapper.writeValueAsString(embeddingsVector);
//
//
// var query = QueryBuilders.scriptScore(s -> s.query(QueryBuilders.matchAll().build()._toQuery()).script(si -> si.inline(i -> i.source("""
// cosineSimilarity(params.query_vector, 'wordEmbeddingsVector') + 1.0
// """).params("query_vector", JsonData.fromJson(vector)))));
//
//
//
// SearchRequest request = new SearchRequest.Builder().query(query)
// .from(0)
// .size(50)
// .trackScores(true)
// .build();
//
//
// SearchResponse response = clientCache.getClient().search(request, IndexDocument.class);
//
// response.hits().hits().stream().forEach(hit -> {
// var h = (Hit) hit;
// System.out.println("Id: " + h.id() + " Score:" + h.score());
// });
//
// }
//
//
//
//
// @Test
// @SneakyThrows
// public void testWordEmbeddingsKnn() {
//
// ClassPathResource textResource = new ClassPathResource("files/1-TEXT.json");
// Text text1 = objectMapper.readValue(textResource.getInputStream(), Text.class);
//
// ClassPathResource embeddingsResource = new ClassPathResource("files/pdf-1.EMBEDDINGS.json");
// Embeddings embeddings1 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
//
// int size = text1.getSectionTexts().size();
// int i = 1;
// for (SectionText sectionText : text1.getSectionTexts()) {
// indexSection("1-TEXT",
// "1-TEXT.pdf",
// sectionText.getSectionNumber(),
// sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
// sectionText.getHeadline(),
// sectionText.getText(),
// null,
// embeddings1.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
// System.out.println("Index section " + i + "of" + size + "for document 1");
// i++;
// }
//
// textResource = new ClassPathResource("files/2-TEXT.json");
// var text2 = objectMapper.readValue(textResource.getInputStream(), Text.class);
//
// embeddingsResource = new ClassPathResource("files/pdf-2.EMBEDDINGS.json");
// var embeddings2 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
//
// size = text1.getSectionTexts().size();
// i = 1;
// for (SectionText sectionText : text2.getSectionTexts()) {
// indexSection("2-TEXT",
// "2-TEXT.pdf",
// sectionText.getSectionNumber(),
// sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
// sectionText.getHeadline(),
// sectionText.getText(),
// null,
// embeddings2.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
// System.out.println("Index section " + i + "of" + size + "for document 2");
// i++;
// }
//
// textResource = new ClassPathResource("files/3-TEXT.json");
// var text3 = objectMapper.readValue(textResource.getInputStream(), Text.class);
//
// embeddingsResource = new ClassPathResource("files/pdf-3.EMBEDDINGS.json");
// var embeddings3 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
//
// size = text1.getSectionTexts().size();
// i = 1;
// for (SectionText sectionText : text3.getSectionTexts()) {
// indexSection("3-TEXT",
// "3-TEXT.pdf",
// sectionText.getSectionNumber(),
// sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
// sectionText.getHeadline(),
// sectionText.getText(),
// null,
// embeddings3.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
// System.out.println("Index section " + i + "of" + size + "for document 3");
// i++;
// }
//
//
// Thread.sleep(1000);
// System.out.println("Finished wait");
//
// for (Map.Entry<String, Float[]> vector : embeddings1.getEmbeddings().entrySet()) {
// System.out.println("Document 1: section: " + vector.getKey());
// List<Double> vectorAsList = new ArrayList<>();
// for (Float f : vector.getValue()) {
// vectorAsList.add((double) f);
// }
//
// KnnQuery k = new KnnQuery.Builder().k(10).numCandidates(100).field("wordEmbeddingsVector").queryVector(vectorAsList).build();
// SearchRequest sr = new SearchRequest.Builder().knn(k)
// .from(0)
// .size(50)
// .trackScores(true)
// .build();
//
// var resp = clientCache.getClient().search(sr, IndexDocument.class);
//
// resp.hits().hits().forEach(hit -> {
// System.out.println("Id: " + hit.id() + " Score:" + hit.score());
// });
//
// }
//
//
// for (Map.Entry<String, Float[]> vector : embeddings2.getEmbeddings().entrySet()) {
// System.out.println("Document 2: section: " + vector.getKey());
// List<Double> vectorAsList = new ArrayList<>();
// for (Float f : vector.getValue()) {
// vectorAsList.add((double) f);
// }
// KnnQuery k = new KnnQuery.Builder().k(10).numCandidates(100).field("wordEmbeddingsVector").queryVector(vectorAsList).build();
// SearchRequest sr = new SearchRequest.Builder().knn(k)
// .from(0)
// .size(50)
// .trackScores(true)
// .build();
//
// var resp = clientCache.getClient().search(sr, IndexDocument.class);
//
// resp.hits().hits().forEach(hit -> {
// System.out.println("Id: " + hit.id() + " Score:" + hit.score());
// });
//
// }
//
//
// for (Map.Entry<String, Float[]> vector : embeddings3.getEmbeddings().entrySet()) {
// System.out.println("Document 3: section: " + vector.getKey());
// List<Double> vectorAsList = new ArrayList<>();
// for (Float f : vector.getValue()) {
// vectorAsList.add((double) f);
// }
// KnnQuery k = new KnnQuery.Builder().k(10).numCandidates(100).field("wordEmbeddingsVector").queryVector(vectorAsList).build();
// SearchRequest sr = new SearchRequest.Builder().knn(k)
// .from(0)
// .size(50)
// .trackScores(true)
// .build();
//
// var resp = clientCache.getClient().search(sr, IndexDocument.class);
//
// resp.hits().hits().forEach(hit -> {
// System.out.println("Id: " + hit.id() + " Score:" + hit.score());
// });
//
// }
//
//
//
// }
@Test
@SneakyThrows
public void testWordEmbeddingsScript() {
public void testParentChild(){
ClassPathResource textResource = new ClassPathResource("files/1-TEXT.json");
Text text1 = objectMapper.readValue(textResource.getInputStream(), Text.class);
@ -91,262 +364,55 @@ public class ElasticsearchTest extends AbstractElasticsearchIntegrationTest {
ClassPathResource embeddingsResource = new ClassPathResource("files/pdf-1.EMBEDDINGS.json");
Embeddings embeddings1 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
int size = text1.getSectionTexts().size();
int i = 1;
for (SectionText sectionText : text1.getSectionTexts()) {
indexSection("1-TEXT",
"1-TEXT.pdf",
sectionText.getSectionNumber(),
sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
sectionText.getHeadline(),
sectionText.getText(),
null,
embeddings1.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
System.out.println("Index section " + i + "of" + size + "for document 1");
i++;
}
textResource = new ClassPathResource("files/2-TEXT.json");
var text2 = objectMapper.readValue(textResource.getInputStream(), Text.class);
embeddingsResource = new ClassPathResource("files/pdf-2.EMBEDDINGS.json");
var embeddings2 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
size = text1.getSectionTexts().size();
i = 1;
for (SectionText sectionText : text2.getSectionTexts()) {
indexSection("2-TEXT",
"2-TEXT.pdf",
sectionText.getSectionNumber(),
sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
sectionText.getHeadline(),
sectionText.getText(),
null,
embeddings2.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
System.out.println("Index section " + i + "of" + size + "for document 2");
i++;
}
textResource = new ClassPathResource("files/3-TEXT.json");
var text3 = objectMapper.readValue(textResource.getInputStream(), Text.class);
embeddingsResource = new ClassPathResource("files/pdf-3.EMBEDDINGS.json");
var embeddings3 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
size = text1.getSectionTexts().size();
i = 1;
for (SectionText sectionText : text3.getSectionTexts()) {
indexSection("3-TEXT",
"3-TEXT.pdf",
sectionText.getSectionNumber(),
sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
sectionText.getHeadline(),
sectionText.getText(),
null,
embeddings3.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
System.out.println("Index section " + i + "of" + size + "for document 3");
i++;
}
indexDocument("fileId1", "FirstFile.pdf", Map.of("dossierTemplateId", "dossierTemplateId1", "dossierId", "dossierId2"));
// Every section needs its own section number: the index id of a child entry is
// parentFileId + "_" + sectionNumber, so reusing the same number would overwrite the
// previously indexed section. Headline and text are taken from the same section.
indexSection("fileId1", 1, text1.getSectionTexts().get(0).getText(), Set.of(1), text1.getSectionTexts().get(0).getHeadline(), embeddings1.getEmbeddings().get("1"));
indexSection("fileId1", 2, text1.getSectionTexts().get(1).getText(), Set.of(1), text1.getSectionTexts().get(1).getHeadline(), embeddings1.getEmbeddings().get("2"));
indexSection("fileId1", 3, text1.getSectionTexts().get(2).getText(), Set.of(1), text1.getSectionTexts().get(2).getHeadline(), embeddings1.getEmbeddings().get("3"));
Thread.sleep(1000);
System.out.println("Finished wait");
var query = QueryBuilders.matchPhrasePrefix(a -> a.query("Abamectin comments").field("section.text"));
for (Map.Entry<String, Float[]> vector : embeddings1.getEmbeddings().entrySet()) {
System.out.println("Document 1: section: " + vector.getKey());
queryWithScript(vector.getValue());
var fq = QueryBuilders.matchPhrasePrefix(a -> a.query("FirstFile.pdf").field("document.filename"));
}
var hp = QueryBuilders.hasParent(a -> a.parentType("document").query(fq));
for (Map.Entry<String, Float[]> vector : embeddings2.getEmbeddings().entrySet()) {
System.out.println("Document 2: section: " + vector.getKey());
queryWithScript(vector.getValue());
}
for (Map.Entry<String, Float[]> vector : embeddings3.getEmbeddings().entrySet()) {
System.out.println("Document 3: section: " + vector.getKey());
queryWithScript(vector.getValue());
}
var parentFiltered = QueryBuilders.bool(a -> a.must(query).filter(hp));
}
SearchRequest sr = new SearchRequest.Builder().query(parentFiltered)
.from(0)
.size(50)
.trackScores(true)
.build();
@SneakyThrows
private void queryWithScript(Float[] embeddingsVector){
final String vector = objectMapper.writeValueAsString(embeddingsVector);
var query = QueryBuilders.scriptScore(s -> s.query(QueryBuilders.matchAll().build()._toQuery()).script(si -> si.inline(i -> i.source("""
cosineSimilarity(params.query_vector, 'wordEmbeddingsVector') + 1.0
""").params("query_vector", JsonData.fromJson(vector)))));
SearchRequest request = new SearchRequest.Builder().query(query)
.from(0)
.size(50)
.trackScores(true)
.build();
SearchResponse response = clientCache.getClient().search(request, IndexDocument.class);
response.hits().hits().stream().forEach(hit -> {
var h = (Hit) hit;
System.out.println("Id: " + h.id() + " Score:" + h.score());
});
}
@Test
@SneakyThrows
public void testWordEmbeddingsKnn() {
ClassPathResource textResource = new ClassPathResource("files/1-TEXT.json");
Text text1 = objectMapper.readValue(textResource.getInputStream(), Text.class);
ClassPathResource embeddingsResource = new ClassPathResource("files/pdf-1.EMBEDDINGS.json");
Embeddings embeddings1 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
int size = text1.getSectionTexts().size();
int i = 1;
for (SectionText sectionText : text1.getSectionTexts()) {
indexSection("1-TEXT",
"1-TEXT.pdf",
sectionText.getSectionNumber(),
sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
sectionText.getHeadline(),
sectionText.getText(),
null,
embeddings1.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
System.out.println("Index section " + i + "of" + size + "for document 1");
i++;
}
textResource = new ClassPathResource("files/2-TEXT.json");
var text2 = objectMapper.readValue(textResource.getInputStream(), Text.class);
embeddingsResource = new ClassPathResource("files/pdf-2.EMBEDDINGS.json");
var embeddings2 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
size = text1.getSectionTexts().size();
i = 1;
for (SectionText sectionText : text2.getSectionTexts()) {
indexSection("2-TEXT",
"2-TEXT.pdf",
sectionText.getSectionNumber(),
sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
sectionText.getHeadline(),
sectionText.getText(),
null,
embeddings2.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
System.out.println("Index section " + i + "of" + size + "for document 2");
i++;
}
textResource = new ClassPathResource("files/3-TEXT.json");
var text3 = objectMapper.readValue(textResource.getInputStream(), Text.class);
embeddingsResource = new ClassPathResource("files/pdf-3.EMBEDDINGS.json");
var embeddings3 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
size = text1.getSectionTexts().size();
i = 1;
for (SectionText sectionText : text3.getSectionTexts()) {
indexSection("3-TEXT",
"3-TEXT.pdf",
sectionText.getSectionNumber(),
sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
sectionText.getHeadline(),
sectionText.getText(),
null,
embeddings3.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
System.out.println("Index section " + i + "of" + size + "for document 3");
i++;
}
Thread.sleep(1000);
System.out.println("Finished wait");
for (Map.Entry<String, Float[]> vector : embeddings1.getEmbeddings().entrySet()) {
System.out.println("Document 1: section: " + vector.getKey());
List<Double> vectorAsList = new ArrayList<>();
for (Float f : vector.getValue()) {
vectorAsList.add((double) f);
}
var knnSearchQuery = new KnnSearchQuery.Builder().numCandidates(100).k(10).field("wordEmbeddingsVector").queryVector(vectorAsList).build();
KnnSearchRequest request = new KnnSearchRequest.Builder().knn(knnSearchQuery).index(TenantContext.getTenantId()).fields("filename").build();
var resp = clientCache.getClient().knnSearch(request, IndexDocument.class);
var resp = clientCache.getClient().search(sr, JsonData.class);
resp.hits().hits().forEach(hit -> {
System.out.println("Id: " + hit.id() + " Score:" + hit.score());
});
}
for (Map.Entry<String, Float[]> vector : embeddings2.getEmbeddings().entrySet()) {
System.out.println("Document 2: section: " + vector.getKey());
List<Double> vectorAsList = new ArrayList<>();
for (Float f : vector.getValue()) {
vectorAsList.add((double) f);
}
var knnSearchQuery = new KnnSearchQuery.Builder().numCandidates(100).k(10).field("wordEmbeddingsVector").queryVector(vectorAsList).build();
KnnSearchRequest request = new KnnSearchRequest.Builder().knn(knnSearchQuery).index(TenantContext.getTenantId()).fields("filename").build();
var resp = clientCache.getClient().knnSearch(request, IndexDocument.class);
resp.hits().hits().forEach(hit -> {
System.out.println("Id: " + hit.id() + " Score:" + hit.score());
});
}
for (Map.Entry<String, Float[]> vector : embeddings3.getEmbeddings().entrySet()) {
System.out.println("Document 3: section: " + vector.getKey());
List<Double> vectorAsList = new ArrayList<>();
for (Float f : vector.getValue()) {
vectorAsList.add((double) f);
}
var knnSearchQuery = new KnnSearchQuery.Builder().numCandidates(100).k(10).field("wordEmbeddingsVector").queryVector(vectorAsList).build();
KnnSearchRequest request = new KnnSearchRequest.Builder().knn(knnSearchQuery).index(TenantContext.getTenantId()).fields("filename").build();
var resp = clientCache.getClient().knnSearch(request, IndexDocument.class);
resp.hits().hits().forEach(hit -> {
System.out.println("Id: " + hit.id() + " Score:" + hit.score());
});
}
}
private void indexSection(String fileId,
private void indexDocument(String fileId,
String filename,
int sectionNr,
Set<Integer> pages,
String headline,
String text,
Map<String, String> fileAttributes,
Float[] embeddingsVector) {
Map<String, String> fileAttributes) {
var indexDocument = indexDocumentConverterService.convert(fileId, filename, sectionNr, pages, headline, text, fileAttributes, embeddingsVector);
var indexDocument = indexDocumentConverterService.convert(fileId, filename, fileAttributes);
documentIndexService.indexDocument(indexDocument);
}
/**
 * Test helper: converts the given section data into an {@link IndexSection} and
 * indexes it as a child of the document {@code fileId}.
 */
private void indexSection(String fileId,
                          int sectionNumber,
                          String text,
                          Set<Integer> pages,
                          String headline,
                          Double[] wordEmbeddingsVector) {

    IndexSection convertedSection = indexDocumentConverterService.convert(sectionNumber, text, pages, headline, wordEmbeddingsVector);
    documentIndexService.indexSection(fileId, convertedSection);
}
public void updateDocument(String fileId, String assignee, boolean deleted, boolean archived, String workflowStatus, Map<String, String> fileAttributes) {
var updateDocument = indexDocumentConverterService.convertUpdateDocument(assignee, deleted, archived, workflowStatus, fileAttributes);

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff