hotfix: Tried parent child
This commit is contained in:
parent
c8f1f1255e
commit
17aaf2e639
@ -0,0 +1,18 @@
|
||||
package com.iqser.red.service.search.v1.server.model;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * Wrapper that is sent to the index for a single section (the "child" side of the
 * parent/child relation): it carries the section payload under {@code section} and the
 * join information under {@code joinType}.
 *
 * <p>Instances are created through the Lombok builder; both constructors are private.
 */
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor(access = AccessLevel.PRIVATE)
|
||||
@AllArgsConstructor(access = AccessLevel.PRIVATE)
|
||||
@SuppressWarnings("serial")
|
||||
public class ChildObject {
|
||||
|
||||
// The section content (text, pages, headline, embeddings vector) to index.
private IndexSection section;
|
||||
// Join field value linking this section to its parent document.
private JoinField joinType;
|
||||
}
|
||||
@ -11,6 +11,6 @@ import lombok.NoArgsConstructor;
|
||||
@AllArgsConstructor
|
||||
public class Embeddings {
|
||||
|
||||
private Map<String, Float[]> embeddings;
|
||||
private Map<String, Double[]> embeddings;
|
||||
|
||||
}
|
||||
|
||||
@ -2,9 +2,7 @@ package com.iqser.red.service.search.v1.server.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -19,18 +17,10 @@ import lombok.NoArgsConstructor;
|
||||
@SuppressWarnings("serial")
|
||||
public class IndexDocument implements Serializable {
|
||||
|
||||
|
||||
private String fileId;
|
||||
private String filename;
|
||||
private int sectionNumber;
|
||||
|
||||
private String text;
|
||||
private Set<Integer> pages;
|
||||
private String headline;
|
||||
|
||||
@Builder.Default
|
||||
private List<IndexFileAttribute> fileAttributes = new ArrayList<>();
|
||||
|
||||
private Float[] wordEmbeddingsVector = new Float[383];
|
||||
|
||||
}
|
||||
|
||||
@ -21,4 +21,6 @@ public class IndexSection implements Serializable {
|
||||
private Set<Integer> pages;
|
||||
private String headline;
|
||||
|
||||
// NOTE(review): the default array length is 383, while the index mapping changed in this
// commit declares "dims": 384 for wordEmbeddingsVector -- confirm the intended size.
private Double[] wordEmbeddingsVector = new Double[383];
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,18 @@
|
||||
package com.iqser.red.service.search.v1.server.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * Value of the {@code joinType} join field for a child entry: the relation name plus the
 * id of the parent it belongs to.
 */
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class JoinField {
|
||||
|
||||
// Relation name; defaults to "section" when not set through the builder.
@Builder.Default
|
||||
private String name = "section";
|
||||
// Id of the parent document this entry is attached to.
private String parent;
|
||||
|
||||
}
|
||||
@ -0,0 +1,21 @@
|
||||
package com.iqser.red.service.search.v1.server.model;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * Wrapper that is sent to the index for a file-level document (the "parent" side of the
 * parent/child relation): it carries the document payload under {@code document} and the
 * relation name under {@code joinType}.
 *
 * <p>Instances are created through the Lombok builder; both constructors are private.
 */
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor(access = AccessLevel.PRIVATE)
|
||||
@AllArgsConstructor(access = AccessLevel.PRIVATE)
|
||||
@SuppressWarnings("serial")
|
||||
public class ParentObject {
|
||||
|
||||
// The file-level payload (file id, filename, file attributes) to index.
private IndexDocument document;
|
||||
|
||||
// Relation name of this entry; defaults to "document" when not set through the builder.
@Builder.Default
|
||||
private String joinType = "document";
|
||||
|
||||
}
|
||||
@ -10,21 +10,29 @@ import org.springframework.stereotype.Service;
|
||||
import com.iqser.red.service.search.v1.server.model.IndexDocument;
|
||||
import com.iqser.red.service.search.v1.server.model.IndexDocumentUpdate;
|
||||
import com.iqser.red.service.search.v1.server.model.IndexFileAttribute;
|
||||
import com.iqser.red.service.search.v1.server.model.IndexSection;
|
||||
|
||||
@Service
|
||||
public class IndexDocumentConverterService {
|
||||
|
||||
public IndexDocument convert(String fileId, String filename, int sectionNr, Set<Integer> pages, String headline, String text, Map<String, String> fileAttributes, Float[] embeddingsVector) {
|
||||
public IndexDocument convert(String fileId, String filename, Map<String, String> fileAttributes) {
|
||||
|
||||
return IndexDocument.builder()
|
||||
.fileId(fileId)
|
||||
.filename(filename)
|
||||
.sectionNumber(sectionNr)
|
||||
.fileAttributes(convertFileAttributes(fileAttributes))
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
public IndexSection convert(int sectionNumber, String text, Set<Integer> pages, String headline, Double[] wordEmbeddingsVector) {
|
||||
|
||||
return IndexSection.builder()
|
||||
.sectionNumber(sectionNumber)
|
||||
.text(text)
|
||||
.pages(pages)
|
||||
.headline(headline)
|
||||
.text(text)
|
||||
.fileAttributes(convertFileAttributes(fileAttributes))
|
||||
.wordEmbeddingsVector(embeddingsVector)
|
||||
.wordEmbeddingsVector(wordEmbeddingsVector)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
@ -6,7 +6,11 @@ import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.search.v1.server.exception.IndexException;
|
||||
import com.iqser.red.service.search.v1.server.model.ChildObject;
|
||||
import com.iqser.red.service.search.v1.server.model.IndexDocument;
|
||||
import com.iqser.red.service.search.v1.server.model.IndexSection;
|
||||
import com.iqser.red.service.search.v1.server.model.JoinField;
|
||||
import com.iqser.red.service.search.v1.server.model.ParentObject;
|
||||
import com.iqser.red.service.search.v1.server.multitenancy.TenantContext;
|
||||
import com.iqser.red.service.search.v1.server.service.DocumentIndexService;
|
||||
import com.iqser.red.service.search.v1.server.settings.ElasticsearchSettings;
|
||||
@ -33,12 +37,26 @@ public class DocumentIndexServiceImpl implements DocumentIndexService {
|
||||
try {
|
||||
clientCache.getClient()
|
||||
.index(i -> i.index(TenantContext.getTenantId())
|
||||
.id(indexDocument.getFileId()+"_"+indexDocument.getSectionNumber())
|
||||
.id(indexDocument.getFileId())
|
||||
.refresh(Refresh._DESERIALIZER.parse(settings.getRefreshPolicy()))
|
||||
.document(indexDocument));
|
||||
.document(ParentObject.builder().document(indexDocument).build()));
|
||||
} catch (IOException | ElasticsearchException e) {
|
||||
throw IndexException.documentIndexError(indexDocument.getFileId(), e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Indexes one section as a child of an already identified parent document.
 *
 * <p>The entry is written to the index named after the current tenant id, with the id
 * {@code parentFileId + "_" + sectionNumber}, so indexing the same section number for the
 * same parent again replaces the earlier entry.
 *
 * @param parentFileId id of the parent document; also used as the routing value
 * @param indexSection the section payload to index
 * @throws IndexException (via {@code IndexException.documentIndexError}) when the client
 *         call fails with an {@code IOException} or {@code ElasticsearchException}
 */
public void indexSection(String parentFileId, IndexSection indexSection){
|
||||
try {
|
||||
clientCache.getClient()
|
||||
.index(i -> i.index(TenantContext.getTenantId())
|
||||
.id(parentFileId + "_" + indexSection.getSectionNumber())
|
||||
// Route by parent id: Elasticsearch join fields require a child to be indexed on the same shard as its parent.
.routing(parentFileId)
|
||||
.refresh(Refresh._DESERIALIZER.parse(settings.getRefreshPolicy()))
|
||||
// Wrap the section together with a join field that points at the parent id.
.document(ChildObject.builder().section(indexSection).joinType(JoinField.builder().parent(parentFileId).build()).build()));
|
||||
} catch (IOException | ElasticsearchException e) {
|
||||
throw IndexException.documentIndexError(parentFileId, e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,44 +1,58 @@
|
||||
{
|
||||
"properties": {
|
||||
"fileId": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"filename": {
|
||||
"type": "text",
|
||||
"term_vector": "with_positions_offsets",
|
||||
"analyzer": "filename_analyzer"
|
||||
},
|
||||
"sectionNumber": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"fileAttributes": {
|
||||
"type": "nested",
|
||||
"include_in_parent": true,
|
||||
"document": {
|
||||
"properties": {
|
||||
"name": {
|
||||
"fileId": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"value": {
|
||||
"filename": {
|
||||
"type": "text",
|
||||
"term_vector": "with_positions_offsets"
|
||||
"term_vector": "with_positions_offsets",
|
||||
"analyzer": "filename_analyzer"
|
||||
},
|
||||
"fileAttributes": {
|
||||
"type": "nested",
|
||||
"include_in_parent": true,
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"value": {
|
||||
"type": "text",
|
||||
"term_vector": "with_positions_offsets"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"headline": {
|
||||
"type": "text"
|
||||
"section": {
|
||||
"properties": {
|
||||
"sectionNumber": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"headline": {
|
||||
"type": "text"
|
||||
},
|
||||
"pages": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"text": {
|
||||
"type": "text",
|
||||
"term_vector": "with_positions_offsets"
|
||||
},
|
||||
"wordEmbeddingsVector": {
|
||||
"type": "dense_vector",
|
||||
"dims": 384,
|
||||
"index": true,
|
||||
"similarity": "cosine"
|
||||
}
|
||||
}
|
||||
},
|
||||
"pages": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"text": {
|
||||
"type": "text",
|
||||
"term_vector": "with_positions_offsets"
|
||||
},
|
||||
"wordEmbeddingsVector": {
|
||||
"type": "dense_vector",
|
||||
"dims": 384,
|
||||
"index": true,
|
||||
"similarity": "cosine"
|
||||
"joinType": {
|
||||
"type": "join",
|
||||
"relations": {
|
||||
"document": "section"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -19,11 +19,14 @@ import com.iqser.red.service.search.v1.server.client.FileStatusProcessingUpdateC
|
||||
import com.iqser.red.service.search.v1.server.client.IndexInformationClient;
|
||||
import com.iqser.red.service.search.v1.server.model.Embeddings;
|
||||
import com.iqser.red.service.search.v1.server.model.IndexDocument;
|
||||
import com.iqser.red.service.search.v1.server.model.IndexSection;
|
||||
import com.iqser.red.service.search.v1.server.model.SectionText;
|
||||
import com.iqser.red.service.search.v1.server.model.Text;
|
||||
import com.iqser.red.service.search.v1.server.multitenancy.TenantContext;
|
||||
import com.iqser.red.service.search.v1.server.service.elasticsearch.DocumentIndexServiceImpl;
|
||||
import com.iqser.red.service.search.v1.server.service.elasticsearch.EsClientCache;
|
||||
|
||||
import co.elastic.clients.elasticsearch._types.KnnQuery;
|
||||
import co.elastic.clients.elasticsearch._types.query_dsl.QueryBuilders;
|
||||
import co.elastic.clients.elasticsearch.core.KnnSearchRequest;
|
||||
import co.elastic.clients.elasticsearch.core.SearchRequest;
|
||||
@ -39,7 +42,7 @@ public class ElasticsearchTest extends AbstractElasticsearchIntegrationTest {
|
||||
private ObjectMapper objectMapper;
|
||||
|
||||
@Autowired
|
||||
private DocumentIndexService documentIndexService;
|
||||
private DocumentIndexServiceImpl documentIndexService;
|
||||
|
||||
|
||||
@Autowired
|
||||
@ -77,13 +80,283 @@ public class ElasticsearchTest extends AbstractElasticsearchIntegrationTest {
|
||||
|
||||
private final long UPDATE_TIMER = 1500;
|
||||
|
||||
|
||||
|
||||
|
||||
// See https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html
|
||||
|
||||
|
||||
|
||||
// @Test
|
||||
// @SneakyThrows
|
||||
// public void testWordEmbeddingsScript() {
|
||||
//
|
||||
// ClassPathResource textResource = new ClassPathResource("files/1-TEXT.json");
|
||||
// Text text1 = objectMapper.readValue(textResource.getInputStream(), Text.class);
|
||||
//
|
||||
// ClassPathResource embeddingsResource = new ClassPathResource("files/pdf-1.EMBEDDINGS.json");
|
||||
// Embeddings embeddings1 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
|
||||
//
|
||||
// int size = text1.getSectionTexts().size();
|
||||
// int i = 1;
|
||||
// for (SectionText sectionText : text1.getSectionTexts()) {
|
||||
// indexSection("1-TEXT",
|
||||
// "1-TEXT.pdf",
|
||||
// sectionText.getSectionNumber(),
|
||||
// sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
|
||||
// sectionText.getHeadline(),
|
||||
// sectionText.getText(),
|
||||
// null,
|
||||
// embeddings1.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
|
||||
// System.out.println("Index section " + i + "of" + size + "for document 1");
|
||||
// i++;
|
||||
// }
|
||||
//
|
||||
// textResource = new ClassPathResource("files/2-TEXT.json");
|
||||
// var text2 = objectMapper.readValue(textResource.getInputStream(), Text.class);
|
||||
//
|
||||
// embeddingsResource = new ClassPathResource("files/pdf-2.EMBEDDINGS.json");
|
||||
// var embeddings2 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
|
||||
//
|
||||
// size = text1.getSectionTexts().size();
|
||||
// i = 1;
|
||||
// for (SectionText sectionText : text2.getSectionTexts()) {
|
||||
// indexSection("2-TEXT",
|
||||
// "2-TEXT.pdf",
|
||||
// sectionText.getSectionNumber(),
|
||||
// sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
|
||||
// sectionText.getHeadline(),
|
||||
// sectionText.getText(),
|
||||
// null,
|
||||
// embeddings2.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
|
||||
// System.out.println("Index section " + i + "of" + size + "for document 2");
|
||||
// i++;
|
||||
// }
|
||||
//
|
||||
// textResource = new ClassPathResource("files/3-TEXT.json");
|
||||
// var text3 = objectMapper.readValue(textResource.getInputStream(), Text.class);
|
||||
//
|
||||
// embeddingsResource = new ClassPathResource("files/pdf-3.EMBEDDINGS.json");
|
||||
// var embeddings3 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
|
||||
//
|
||||
// size = text1.getSectionTexts().size();
|
||||
// i = 1;
|
||||
// for (SectionText sectionText : text3.getSectionTexts()) {
|
||||
// indexSection("3-TEXT",
|
||||
// "3-TEXT.pdf",
|
||||
// sectionText.getSectionNumber(),
|
||||
// sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
|
||||
// sectionText.getHeadline(),
|
||||
// sectionText.getText(),
|
||||
// null,
|
||||
// embeddings3.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
|
||||
// System.out.println("Index section " + i + "of" + size + "for document 3");
|
||||
// i++;
|
||||
// }
|
||||
//
|
||||
//
|
||||
// Thread.sleep(240000);
|
||||
// System.out.println("Finished wait");
|
||||
//
|
||||
// for (Map.Entry<String, Float[]> vector : embeddings1.getEmbeddings().entrySet()) {
|
||||
// System.out.println("Document 1: section: " + vector.getKey());
|
||||
// queryWithScript(vector.getValue());
|
||||
//
|
||||
// }
|
||||
//
|
||||
//
|
||||
// for (Map.Entry<String, Float[]> vector : embeddings2.getEmbeddings().entrySet()) {
|
||||
// System.out.println("Document 2: section: " + vector.getKey());
|
||||
// queryWithScript(vector.getValue());
|
||||
//
|
||||
// }
|
||||
//
|
||||
//
|
||||
// for (Map.Entry<String, Float[]> vector : embeddings3.getEmbeddings().entrySet()) {
|
||||
// System.out.println("Document 3: section: " + vector.getKey());
|
||||
// queryWithScript(vector.getValue());
|
||||
//
|
||||
// }
|
||||
//
|
||||
//
|
||||
//
|
||||
// }
|
||||
//
|
||||
//
|
||||
// @SneakyThrows
|
||||
// private void queryWithScript(Float[] embeddingsVector){
|
||||
// final String vector = objectMapper.writeValueAsString(embeddingsVector);
|
||||
//
|
||||
//
|
||||
// var query = QueryBuilders.scriptScore(s -> s.query(QueryBuilders.matchAll().build()._toQuery()).script(si -> si.inline(i -> i.source("""
|
||||
// cosineSimilarity(params.query_vector, 'wordEmbeddingsVector') + 1.0
|
||||
// """).params("query_vector", JsonData.fromJson(vector)))));
|
||||
//
|
||||
//
|
||||
//
|
||||
// SearchRequest request = new SearchRequest.Builder().query(query)
|
||||
// .from(0)
|
||||
// .size(50)
|
||||
// .trackScores(true)
|
||||
// .build();
|
||||
//
|
||||
//
|
||||
// SearchResponse response = clientCache.getClient().search(request, IndexDocument.class);
|
||||
//
|
||||
// response.hits().hits().stream().forEach(hit -> {
|
||||
// var h = (Hit) hit;
|
||||
// System.out.println("Id: " + h.id() + " Score:" + h.score());
|
||||
// });
|
||||
//
|
||||
// }
|
||||
//
|
||||
//
|
||||
//
|
||||
//
|
||||
// @Test
|
||||
// @SneakyThrows
|
||||
// public void testWordEmbeddingsKnn() {
|
||||
//
|
||||
// ClassPathResource textResource = new ClassPathResource("files/1-TEXT.json");
|
||||
// Text text1 = objectMapper.readValue(textResource.getInputStream(), Text.class);
|
||||
//
|
||||
// ClassPathResource embeddingsResource = new ClassPathResource("files/pdf-1.EMBEDDINGS.json");
|
||||
// Embeddings embeddings1 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
|
||||
//
|
||||
// int size = text1.getSectionTexts().size();
|
||||
// int i = 1;
|
||||
// for (SectionText sectionText : text1.getSectionTexts()) {
|
||||
// indexSection("1-TEXT",
|
||||
// "1-TEXT.pdf",
|
||||
// sectionText.getSectionNumber(),
|
||||
// sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
|
||||
// sectionText.getHeadline(),
|
||||
// sectionText.getText(),
|
||||
// null,
|
||||
// embeddings1.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
|
||||
// System.out.println("Index section " + i + "of" + size + "for document 1");
|
||||
// i++;
|
||||
// }
|
||||
//
|
||||
// textResource = new ClassPathResource("files/2-TEXT.json");
|
||||
// var text2 = objectMapper.readValue(textResource.getInputStream(), Text.class);
|
||||
//
|
||||
// embeddingsResource = new ClassPathResource("files/pdf-2.EMBEDDINGS.json");
|
||||
// var embeddings2 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
|
||||
//
|
||||
// size = text1.getSectionTexts().size();
|
||||
// i = 1;
|
||||
// for (SectionText sectionText : text2.getSectionTexts()) {
|
||||
// indexSection("2-TEXT",
|
||||
// "2-TEXT.pdf",
|
||||
// sectionText.getSectionNumber(),
|
||||
// sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
|
||||
// sectionText.getHeadline(),
|
||||
// sectionText.getText(),
|
||||
// null,
|
||||
// embeddings2.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
|
||||
// System.out.println("Index section " + i + "of" + size + "for document 2");
|
||||
// i++;
|
||||
// }
|
||||
//
|
||||
// textResource = new ClassPathResource("files/3-TEXT.json");
|
||||
// var text3 = objectMapper.readValue(textResource.getInputStream(), Text.class);
|
||||
//
|
||||
// embeddingsResource = new ClassPathResource("files/pdf-3.EMBEDDINGS.json");
|
||||
// var embeddings3 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
|
||||
//
|
||||
// size = text1.getSectionTexts().size();
|
||||
// i = 1;
|
||||
// for (SectionText sectionText : text3.getSectionTexts()) {
|
||||
// indexSection("3-TEXT",
|
||||
// "3-TEXT.pdf",
|
||||
// sectionText.getSectionNumber(),
|
||||
// sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
|
||||
// sectionText.getHeadline(),
|
||||
// sectionText.getText(),
|
||||
// null,
|
||||
// embeddings3.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
|
||||
// System.out.println("Index section " + i + "of" + size + "for document 3");
|
||||
// i++;
|
||||
// }
|
||||
//
|
||||
//
|
||||
// Thread.sleep(1000);
|
||||
// System.out.println("Finished wait");
|
||||
//
|
||||
// for (Map.Entry<String, Float[]> vector : embeddings1.getEmbeddings().entrySet()) {
|
||||
// System.out.println("Document 1: section: " + vector.getKey());
|
||||
// List<Double> vectorAsList = new ArrayList<>();
|
||||
// for (Float f : vector.getValue()) {
|
||||
// vectorAsList.add((double) f);
|
||||
// }
|
||||
//
|
||||
// KnnQuery k = new KnnQuery.Builder().k(10).numCandidates(100).field("wordEmbeddingsVector").queryVector(vectorAsList).build();
|
||||
// SearchRequest sr = new SearchRequest.Builder().knn(k)
|
||||
// .from(0)
|
||||
// .size(50)
|
||||
// .trackScores(true)
|
||||
// .build();
|
||||
//
|
||||
// var resp = clientCache.getClient().search(sr, IndexDocument.class);
|
||||
//
|
||||
// resp.hits().hits().forEach(hit -> {
|
||||
// System.out.println("Id: " + hit.id() + " Score:" + hit.score());
|
||||
// });
|
||||
//
|
||||
// }
|
||||
//
|
||||
//
|
||||
// for (Map.Entry<String, Float[]> vector : embeddings2.getEmbeddings().entrySet()) {
|
||||
// System.out.println("Document 2: section: " + vector.getKey());
|
||||
// List<Double> vectorAsList = new ArrayList<>();
|
||||
// for (Float f : vector.getValue()) {
|
||||
// vectorAsList.add((double) f);
|
||||
// }
|
||||
// KnnQuery k = new KnnQuery.Builder().k(10).numCandidates(100).field("wordEmbeddingsVector").queryVector(vectorAsList).build();
|
||||
// SearchRequest sr = new SearchRequest.Builder().knn(k)
|
||||
// .from(0)
|
||||
// .size(50)
|
||||
// .trackScores(true)
|
||||
// .build();
|
||||
//
|
||||
// var resp = clientCache.getClient().search(sr, IndexDocument.class);
|
||||
//
|
||||
// resp.hits().hits().forEach(hit -> {
|
||||
// System.out.println("Id: " + hit.id() + " Score:" + hit.score());
|
||||
// });
|
||||
//
|
||||
// }
|
||||
//
|
||||
//
|
||||
// for (Map.Entry<String, Float[]> vector : embeddings3.getEmbeddings().entrySet()) {
|
||||
// System.out.println("Document 3: section: " + vector.getKey());
|
||||
// List<Double> vectorAsList = new ArrayList<>();
|
||||
// for (Float f : vector.getValue()) {
|
||||
// vectorAsList.add((double) f);
|
||||
// }
|
||||
// KnnQuery k = new KnnQuery.Builder().k(10).numCandidates(100).field("wordEmbeddingsVector").queryVector(vectorAsList).build();
|
||||
// SearchRequest sr = new SearchRequest.Builder().knn(k)
|
||||
// .from(0)
|
||||
// .size(50)
|
||||
// .trackScores(true)
|
||||
// .build();
|
||||
//
|
||||
// var resp = clientCache.getClient().search(sr, IndexDocument.class);
|
||||
//
|
||||
// resp.hits().hits().forEach(hit -> {
|
||||
// System.out.println("Id: " + hit.id() + " Score:" + hit.score());
|
||||
// });
|
||||
//
|
||||
// }
|
||||
//
|
||||
//
|
||||
//
|
||||
// }
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testWordEmbeddingsScript() {
|
||||
public void testParentChild(){
|
||||
|
||||
ClassPathResource textResource = new ClassPathResource("files/1-TEXT.json");
|
||||
Text text1 = objectMapper.readValue(textResource.getInputStream(), Text.class);
|
||||
@ -91,262 +364,55 @@ public class ElasticsearchTest extends AbstractElasticsearchIntegrationTest {
|
||||
ClassPathResource embeddingsResource = new ClassPathResource("files/pdf-1.EMBEDDINGS.json");
|
||||
Embeddings embeddings1 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
|
||||
|
||||
int size = text1.getSectionTexts().size();
|
||||
int i = 1;
|
||||
for (SectionText sectionText : text1.getSectionTexts()) {
|
||||
indexSection("1-TEXT",
|
||||
"1-TEXT.pdf",
|
||||
sectionText.getSectionNumber(),
|
||||
sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
|
||||
sectionText.getHeadline(),
|
||||
sectionText.getText(),
|
||||
null,
|
||||
embeddings1.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
|
||||
System.out.println("Index section " + i + "of" + size + "for document 1");
|
||||
i++;
|
||||
}
|
||||
|
||||
textResource = new ClassPathResource("files/2-TEXT.json");
|
||||
var text2 = objectMapper.readValue(textResource.getInputStream(), Text.class);
|
||||
|
||||
embeddingsResource = new ClassPathResource("files/pdf-2.EMBEDDINGS.json");
|
||||
var embeddings2 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
|
||||
|
||||
size = text1.getSectionTexts().size();
|
||||
i = 1;
|
||||
for (SectionText sectionText : text2.getSectionTexts()) {
|
||||
indexSection("2-TEXT",
|
||||
"2-TEXT.pdf",
|
||||
sectionText.getSectionNumber(),
|
||||
sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
|
||||
sectionText.getHeadline(),
|
||||
sectionText.getText(),
|
||||
null,
|
||||
embeddings2.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
|
||||
System.out.println("Index section " + i + "of" + size + "for document 2");
|
||||
i++;
|
||||
}
|
||||
|
||||
textResource = new ClassPathResource("files/3-TEXT.json");
|
||||
var text3 = objectMapper.readValue(textResource.getInputStream(), Text.class);
|
||||
|
||||
embeddingsResource = new ClassPathResource("files/pdf-3.EMBEDDINGS.json");
|
||||
var embeddings3 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
|
||||
|
||||
size = text1.getSectionTexts().size();
|
||||
i = 1;
|
||||
for (SectionText sectionText : text3.getSectionTexts()) {
|
||||
indexSection("3-TEXT",
|
||||
"3-TEXT.pdf",
|
||||
sectionText.getSectionNumber(),
|
||||
sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
|
||||
sectionText.getHeadline(),
|
||||
sectionText.getText(),
|
||||
null,
|
||||
embeddings3.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
|
||||
System.out.println("Index section " + i + "of" + size + "for document 3");
|
||||
i++;
|
||||
}
|
||||
indexDocument("fileId1", "FirstFile.pdf", Map.of("dossierTemplateId", "dossierTemplateId1", "dossierId", "dossierId2"));
|
||||
// Index the first three sections of document 1 as children of "fileId1".
// Each section needs its own section number: the child id is built as
// parentFileId + "_" + sectionNumber, so reusing 1 would make every call overwrite the
// previous one. Text, headline and embeddings are taken from the same section.
indexSection("fileId1", 1, text1.getSectionTexts().get(0).getText(),Set.of(1), text1.getSectionTexts().get(0).getHeadline(), embeddings1.getEmbeddings().get("1"));
|
||||
indexSection("fileId1", 2, text1.getSectionTexts().get(1).getText(),Set.of(1), text1.getSectionTexts().get(1).getHeadline(), embeddings1.getEmbeddings().get("2"));
|
||||
indexSection("fileId1", 3, text1.getSectionTexts().get(2).getText(),Set.of(1), text1.getSectionTexts().get(2).getHeadline(), embeddings1.getEmbeddings().get("3"));
|
||||
|
||||
|
||||
Thread.sleep(1000);
|
||||
System.out.println("Finished wait");
|
||||
var query = QueryBuilders.matchPhrasePrefix(a -> a.query("Abamectin comments").field("section.text"));
|
||||
|
||||
for (Map.Entry<String, Float[]> vector : embeddings1.getEmbeddings().entrySet()) {
|
||||
System.out.println("Document 1: section: " + vector.getKey());
|
||||
queryWithScript(vector.getValue());
|
||||
var fq = QueryBuilders.matchPhrasePrefix(a -> a.query("FirstFile.pdf").field("document.filename"));
|
||||
|
||||
}
|
||||
var hp = QueryBuilders.hasParent(a -> a.parentType("document").query(fq));
|
||||
|
||||
|
||||
for (Map.Entry<String, Float[]> vector : embeddings2.getEmbeddings().entrySet()) {
|
||||
System.out.println("Document 2: section: " + vector.getKey());
|
||||
queryWithScript(vector.getValue());
|
||||
|
||||
}
|
||||
|
||||
|
||||
for (Map.Entry<String, Float[]> vector : embeddings3.getEmbeddings().entrySet()) {
|
||||
System.out.println("Document 3: section: " + vector.getKey());
|
||||
queryWithScript(vector.getValue());
|
||||
|
||||
}
|
||||
var parentFiltered = QueryBuilders.bool(a -> a.must(query).filter(hp));
|
||||
|
||||
|
||||
|
||||
}
|
||||
SearchRequest sr = new SearchRequest.Builder().query(parentFiltered)
|
||||
.from(0)
|
||||
.size(50)
|
||||
.trackScores(true)
|
||||
.build();
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void queryWithScript(Float[] embeddingsVector){
|
||||
final String vector = objectMapper.writeValueAsString(embeddingsVector);
|
||||
|
||||
|
||||
var query = QueryBuilders.scriptScore(s -> s.query(QueryBuilders.matchAll().build()._toQuery()).script(si -> si.inline(i -> i.source("""
|
||||
cosineSimilarity(params.query_vector, 'wordEmbeddingsVector') + 1.0
|
||||
""").params("query_vector", JsonData.fromJson(vector)))));
|
||||
|
||||
|
||||
|
||||
SearchRequest request = new SearchRequest.Builder().query(query)
|
||||
.from(0)
|
||||
.size(50)
|
||||
.trackScores(true)
|
||||
.build();
|
||||
|
||||
|
||||
SearchResponse response = clientCache.getClient().search(request, IndexDocument.class);
|
||||
|
||||
response.hits().hits().stream().forEach(hit -> {
|
||||
var h = (Hit) hit;
|
||||
System.out.println("Id: " + h.id() + " Score:" + h.score());
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testWordEmbeddingsKnn() {
|
||||
|
||||
ClassPathResource textResource = new ClassPathResource("files/1-TEXT.json");
|
||||
Text text1 = objectMapper.readValue(textResource.getInputStream(), Text.class);
|
||||
|
||||
ClassPathResource embeddingsResource = new ClassPathResource("files/pdf-1.EMBEDDINGS.json");
|
||||
Embeddings embeddings1 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
|
||||
|
||||
int size = text1.getSectionTexts().size();
|
||||
int i = 1;
|
||||
for (SectionText sectionText : text1.getSectionTexts()) {
|
||||
indexSection("1-TEXT",
|
||||
"1-TEXT.pdf",
|
||||
sectionText.getSectionNumber(),
|
||||
sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
|
||||
sectionText.getHeadline(),
|
||||
sectionText.getText(),
|
||||
null,
|
||||
embeddings1.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
|
||||
System.out.println("Index section " + i + "of" + size + "for document 1");
|
||||
i++;
|
||||
}
|
||||
|
||||
textResource = new ClassPathResource("files/2-TEXT.json");
|
||||
var text2 = objectMapper.readValue(textResource.getInputStream(), Text.class);
|
||||
|
||||
embeddingsResource = new ClassPathResource("files/pdf-2.EMBEDDINGS.json");
|
||||
var embeddings2 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
|
||||
|
||||
size = text1.getSectionTexts().size();
|
||||
i = 1;
|
||||
for (SectionText sectionText : text2.getSectionTexts()) {
|
||||
indexSection("2-TEXT",
|
||||
"2-TEXT.pdf",
|
||||
sectionText.getSectionNumber(),
|
||||
sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
|
||||
sectionText.getHeadline(),
|
||||
sectionText.getText(),
|
||||
null,
|
||||
embeddings2.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
|
||||
System.out.println("Index section " + i + "of" + size + "for document 2");
|
||||
i++;
|
||||
}
|
||||
|
||||
textResource = new ClassPathResource("files/3-TEXT.json");
|
||||
var text3 = objectMapper.readValue(textResource.getInputStream(), Text.class);
|
||||
|
||||
embeddingsResource = new ClassPathResource("files/pdf-3.EMBEDDINGS.json");
|
||||
var embeddings3 = objectMapper.readValue(embeddingsResource.getInputStream(), Embeddings.class);
|
||||
|
||||
size = text1.getSectionTexts().size();
|
||||
i = 1;
|
||||
for (SectionText sectionText : text3.getSectionTexts()) {
|
||||
indexSection("3-TEXT",
|
||||
"3-TEXT.pdf",
|
||||
sectionText.getSectionNumber(),
|
||||
sectionText.getSectionAreas().stream().map(a -> a.getPage()).collect(Collectors.toSet()),
|
||||
sectionText.getHeadline(),
|
||||
sectionText.getText(),
|
||||
null,
|
||||
embeddings3.getEmbeddings().get(String.valueOf(sectionText.getSectionNumber())));
|
||||
System.out.println("Index section " + i + "of" + size + "for document 3");
|
||||
i++;
|
||||
}
|
||||
|
||||
|
||||
Thread.sleep(1000);
|
||||
System.out.println("Finished wait");
|
||||
|
||||
for (Map.Entry<String, Float[]> vector : embeddings1.getEmbeddings().entrySet()) {
|
||||
System.out.println("Document 1: section: " + vector.getKey());
|
||||
List<Double> vectorAsList = new ArrayList<>();
|
||||
for (Float f : vector.getValue()) {
|
||||
vectorAsList.add((double) f);
|
||||
}
|
||||
var knnSearchQuery = new KnnSearchQuery.Builder().numCandidates(100).k(10).field("wordEmbeddingsVector").queryVector(vectorAsList).build();
|
||||
KnnSearchRequest request = new KnnSearchRequest.Builder().knn(knnSearchQuery).index(TenantContext.getTenantId()).fields("filename").build();
|
||||
var resp = clientCache.getClient().knnSearch(request, IndexDocument.class);
|
||||
var resp = clientCache.getClient().search(sr, JsonData.class);
|
||||
|
||||
resp.hits().hits().forEach(hit -> {
|
||||
System.out.println("Id: " + hit.id() + " Score:" + hit.score());
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
|
||||
for (Map.Entry<String, Float[]> vector : embeddings2.getEmbeddings().entrySet()) {
|
||||
System.out.println("Document 2: section: " + vector.getKey());
|
||||
List<Double> vectorAsList = new ArrayList<>();
|
||||
for (Float f : vector.getValue()) {
|
||||
vectorAsList.add((double) f);
|
||||
}
|
||||
var knnSearchQuery = new KnnSearchQuery.Builder().numCandidates(100).k(10).field("wordEmbeddingsVector").queryVector(vectorAsList).build();
|
||||
KnnSearchRequest request = new KnnSearchRequest.Builder().knn(knnSearchQuery).index(TenantContext.getTenantId()).fields("filename").build();
|
||||
var resp = clientCache.getClient().knnSearch(request, IndexDocument.class);
|
||||
|
||||
resp.hits().hits().forEach(hit -> {
|
||||
System.out.println("Id: " + hit.id() + " Score:" + hit.score());
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
|
||||
for (Map.Entry<String, Float[]> vector : embeddings3.getEmbeddings().entrySet()) {
|
||||
System.out.println("Document 3: section: " + vector.getKey());
|
||||
List<Double> vectorAsList = new ArrayList<>();
|
||||
for (Float f : vector.getValue()) {
|
||||
vectorAsList.add((double) f);
|
||||
}
|
||||
var knnSearchQuery = new KnnSearchQuery.Builder().numCandidates(100).k(10).field("wordEmbeddingsVector").queryVector(vectorAsList).build();
|
||||
KnnSearchRequest request = new KnnSearchRequest.Builder().knn(knnSearchQuery).index(TenantContext.getTenantId()).fields("filename").build();
|
||||
var resp = clientCache.getClient().knnSearch(request, IndexDocument.class);
|
||||
|
||||
resp.hits().hits().forEach(hit -> {
|
||||
System.out.println("Id: " + hit.id() + " Score:" + hit.score());
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void indexSection(String fileId,
|
||||
private void indexDocument(String fileId,
|
||||
String filename,
|
||||
int sectionNr,
|
||||
Set<Integer> pages,
|
||||
String headline,
|
||||
String text,
|
||||
Map<String, String> fileAttributes,
|
||||
Float[] embeddingsVector) {
|
||||
Map<String, String> fileAttributes) {
|
||||
|
||||
var indexDocument = indexDocumentConverterService.convert(fileId, filename, sectionNr, pages, headline, text, fileAttributes, embeddingsVector);
|
||||
var indexDocument = indexDocumentConverterService.convert(fileId, filename, fileAttributes);
|
||||
documentIndexService.indexDocument(indexDocument);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Test helper: converts the given section data to an {@code IndexSection} via the
 * converter service and indexes it under the given file id.
 *
 * @param fileId id of the parent document the section is indexed under
 * @param sectionNumber number of the section
 * @param text section text
 * @param pages pages passed on to the converter for this section
 * @param headline section headline
 * @param wordEmbeddingsVector embeddings vector passed on to the converter
 */
private void indexSection(String fileId, int sectionNumber, String text, Set<Integer> pages, String headline, Double[] wordEmbeddingsVector) {
|
||||
|
||||
var indexSection = indexDocumentConverterService.convert(sectionNumber, text, pages, headline, wordEmbeddingsVector);
|
||||
documentIndexService.indexSection(fileId, indexSection);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
public void updateDocument(String fileId, String assignee, boolean deleted, boolean archived, String workflowStatus, Map<String, String> fileAttributes) {
|
||||
|
||||
var updateDocument = indexDocumentConverterService.convertUpdateDocument(assignee, deleted, archived, workflowStatus, fileAttributes);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user