abhi001vj committed on
Commit 1d3f9ab
• 1 Parent(s): a5f94e4

Fixed the pinecone retrieval issue

Files changed (2):
  1. .gitattributes +1 -0
  2. app.py +99 -74
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+.streamlit/
app.py CHANGED
@@ -6,12 +6,13 @@ import sys
 import uuid
 from json import JSONDecodeError
 from pathlib import Path
+from typing import List, Optional
 
 import pandas as pd
 import pinecone
 import streamlit as st
 from annotated_text import annotation
-from haystack import Document
+from haystack import BaseComponent, Document
 from haystack.document_stores import PineconeDocumentStore
 from haystack.nodes import (
     DocxToTextConverter,
@@ -26,22 +27,48 @@ from haystack.pipelines import ExtractiveQAPipeline, Pipeline
 from markdown import markdown
 from sentence_transformers import SentenceTransformer
 
+
+class PineconeSearch(BaseComponent):
+    outgoing_edges = 1
+
+    def run(self, query: str, top_k: Optional[int]):
+        # embed the query and fetch the closest matches from the Pinecone index
+        vector_embedding = emb_model.encode(query).tolist()
+        response = index.query([vector_embedding], top_k=top_k, include_metadata=True)
+        docs = [
+            Document(
+                content=d["metadata"]["text"],
+                meta={
+                    "title": d["metadata"]["filename"],
+                    "context": d["metadata"]["text"],
+                    "_split_id": d["metadata"]["_split_id"],
+                },
+            )
+            for d in response["matches"]
+        ]
+        output = {"documents": docs, "query": query}
+        return output, "output_1"
+
+    def run_batch(self, queries: List[str], top_k: Optional[int]):
+        return {}, "output_1"
+
+
 # connect to pinecone environment
-pinecone.init(
-    api_key=st.secrets["pinecone_apikey"],
-    environment="us-west1-gcp"
-)
+pinecone.init(api_key=st.secrets["pinecone_apikey"], environment="us-west1-gcp")
 index_name = "qa-demo-fast-384"
 # retriever_model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
 retriever_model = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
-embedding_dim=384
+emb_model = SentenceTransformer(retriever_model)
+
+embedding_dim = 384
 preprocessor = PreProcessor(
     clean_empty_lines=True,
     clean_whitespace=True,
     clean_header_footer=False,
     split_by="word",
     split_length=100,
-    split_respect_sentence_boundary=True
+    split_respect_sentence_boundary=True,
 )
 file_type_classifier = FileTypeClassifier()
 text_converter = TextConverter()
@@ -53,65 +80,50 @@ if index_name not in pinecone.list_indexes():
     # delete the current index and create the new index if it does not exist
     for delete_index in pinecone.list_indexes():
         pinecone.delete_index(delete_index)
-    pinecone.create_index(
-        index_name,
-        dimension=embedding_dim,
-        metric="cosine"
-    )
+    pinecone.create_index(index_name, dimension=embedding_dim, metric="cosine")
 
 # connect to abstractive-question-answering index we created
 index = pinecone.Index(index_name)
 
-FILE_UPLOAD_PATH= "./data/uploads/"
+FILE_UPLOAD_PATH = "./data/uploads/"
 os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)
-# @st.cache
+
+
 def create_doc_store():
     document_store = PineconeDocumentStore(
-        api_key= st.secrets["pinecone_apikey"],
+        api_key=st.secrets["pinecone_apikey"],
         index=index_name,
         similarity="cosine",
-        embedding_dim=embedding_dim
+        embedding_dim=embedding_dim,
     )
     return document_store
 
-# @st.cache
-# def create_pipe(document_store):
-#     retriever = EmbeddingRetriever(
-#         document_store=document_store,
-#         embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
-#         model_format="sentence_transformers",
-#     )
-#     reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
-#     pipe = ExtractiveQAPipeline(reader, retriever)
-#     return pipe
 
 def query(pipe, question, top_k_reader, top_k_retriever):
     res = pipe.run(
-        query=question, params={"Retriever": {"top_k": top_k_retriever}, "Reader": {"top_k": top_k_reader}}
+        query=question,
+        params={"Retriever": {"top_k": top_k_retriever}, "Reader": {"top_k": top_k_reader}},
     )
-    answer_df = []
-    # for r in res['answers']:
-    #     ans_dict = res['answers'][0].meta
-    #     ans_dict["answer"] = r.context
-    #     answer_df.append(ans_dict)
-    # result = pd.DataFrame(answer_df)
-    # result.columns = ["Source","Title","Year","Link","Answer"]
-    # result[["Answer","Link","Source","Title","Year"]]
     return res
 
+
 document_store = create_doc_store()
 # pipe = create_pipe(document_store)
 
 retriever = EmbeddingRetriever(
     document_store=document_store,
     embedding_model=retriever_model,
     model_format="sentence_transformers",
 )
 # load the retriever model from huggingface model hub
 sentence_encoder = SentenceTransformer(retriever_model)
 
 reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
-pipe = ExtractiveQAPipeline(reader, retriever)
+# pipe = ExtractiveQAPipeline(reader, retriever)
+# Custom built extractive QA pipeline
+pipe = Pipeline()
+pipe.add_node(component=PineconeSearch(), name="Retriever", inputs=["Query"])
+pipe.add_node(component=reader, name="Reader", inputs=["Retriever"])
 
 
 indexing_pipeline_with_classification = Pipeline()
@@ -133,20 +145,29 @@ indexing_pipeline_with_classification.add_node(
     inputs=["TextConverter", "PdfConverter", "DocxConverter"],
 )
 
+
 def set_state_if_absent(key, value):
     if key not in st.session_state:
         st.session_state[key] = value
 
+
 # Adjust to a question that you would like users to see in the search bar when they load the UI:
-DEFAULT_QUESTION_AT_STARTUP = os.getenv("DEFAULT_QUESTION_AT_STARTUP", "My blog post discusses remote work. Give me statistics.")
-DEFAULT_ANSWER_AT_STARTUP = os.getenv("DEFAULT_ANSWER_AT_STARTUP", "7% more remote workers have been at their current organization for 5 years or fewer")
+DEFAULT_QUESTION_AT_STARTUP = os.getenv(
+    "DEFAULT_QUESTION_AT_STARTUP", "My blog post discusses remote work. Give me statistics."
+)
+DEFAULT_ANSWER_AT_STARTUP = os.getenv(
+    "DEFAULT_ANSWER_AT_STARTUP",
+    "7% more remote workers have been at their current organization for 5 years or fewer",
+)
 
 # Sliders
 DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
 DEFAULT_NUMBER_OF_ANSWERS = int(os.getenv("DEFAULT_NUMBER_OF_ANSWERS", "3"))
 
 
-st.set_page_config(page_title="Haystack Demo", page_icon="https://haystack.deepset.ai/img/HaystackIcon.png")
+st.set_page_config(
+    page_title="Haystack Demo", page_icon="https://haystack.deepset.ai/img/HaystackIcon.png"
+)
 
 # Persistent state
 set_state_if_absent("question", DEFAULT_QUESTION_AT_STARTUP)
@@ -160,6 +181,7 @@ def reset_results(*args):
     st.session_state.results = None
     st.session_state.raw_json = None
 
+
 # Title
 st.write("# Haystack Search Demo")
 st.markdown(
@@ -187,12 +209,16 @@ for data_file in data_files:
         f.write(data_file.getbuffer())
     ALL_FILES.append(file_path)
     st.sidebar.write(str(data_file.name) + " &nbsp;&nbsp; ✅ ")
-    META_DATA.append({"filename":data_file.name})
+    META_DATA.append({"filename": data_file.name})
+
+data_files = []
+
 
 if len(ALL_FILES) > 0:
     # document_store.update_embeddings(retriever, update_existing_embeddings=False)
-    docs = indexing_pipeline_with_classification.run(file_paths=ALL_FILES, meta=META_DATA)["documents"]
+    docs = indexing_pipeline_with_classification.run(file_paths=ALL_FILES, meta=META_DATA)[
+        "documents"
+    ]
     index_name = "qa_demo"
     # we will use batches of 64
     batch_size = 128
@@ -204,7 +230,7 @@ if len(ALL_FILES) > 0:
     upload_count = 0
     for i in range(0, len(docs), batch_size):
         # find end of batch
-        i_end = min(i+batch_size, len(docs))
+        i_end = min(i + batch_size, len(docs))
         # extract batch
         batch = [doc.content for doc in docs[i:i_end]]
         # generate embeddings for batch
@@ -222,10 +248,10 @@ if len(ALL_FILES) > 0:
         to_upsert = list(zip(ids, emb, meta))
         # upsert/insert these records to pinecone
         _ = index.upsert(vectors=to_upsert)
-        upload_count+=batch_size
-        upload_percentage = min(int((upload_count/len(docs))*100), 100)
+        upload_count += batch_size
+        upload_percentage = min(int((upload_count / len(docs)) * 100), 100)
         my_bar.progress(upload_percentage)
 
 top_k_reader = st.sidebar.slider(
     "Max. number of answers",
     min_value=1,
@@ -251,12 +277,12 @@ top_k_retriever = st.sidebar.slider(
 # raw_json = upload_doc(data_file)
 
 question = st.text_input(
     value=st.session_state.question,
     max_chars=100,
     on_change=reset_results,
     label="question",
     label_visibility="hidden",
 )
 col1, col2 = st.columns(2)
 col1.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
 col2.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
@@ -265,23 +291,21 @@ col2.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html
 run_pressed = col1.button("Run")
 if run_pressed:
 
-    run_query = (
-        run_pressed or question != st.session_state.question
-    )
+    run_query = run_pressed or question != st.session_state.question
     # Get results for query
     if run_query and question:
         reset_results()
         st.session_state.question = question
 
-        with st.spinner(
-            "🧠 &nbsp;&nbsp; Performing neural search on documents... \n "
-        ):
+        with st.spinner("🧠 &nbsp;&nbsp; Performing neural search on documents... \n "):
             try:
                 st.session_state.results = query(
                     pipe, question, top_k_reader=top_k_reader, top_k_retriever=top_k_retriever
                 )
             except JSONDecodeError as je:
-                st.error("👓 &nbsp;&nbsp; An error occurred reading the results. Is the document store working?")
+                st.error(
+                    "👓 &nbsp;&nbsp; An error occurred reading the results. Is the document store working?"
+                )
             except Exception as e:
                 logging.exception(e)
                 if "The server is busy processing requests" in str(e) or "503" in str(e):
@@ -294,23 +318,24 @@ if st.session_state.results:
 
     st.write("## Results:")
 
-    for count, result in enumerate(st.session_state.results['answers']):
+    for count, result in enumerate(st.session_state.results["answers"]):
         answer, context = result.answer, result.context
         start_idx = context.find(answer)
         end_idx = start_idx + len(answer)
         # Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
         try:
-            source = f"[{result.meta['Title']}]({result.meta['link']})"
+            filename = result.meta["title"]
             st.write(
-                markdown(f'**Source:** {source} \n {context[:start_idx] } {str(annotation(answer, "ANSWER", "#8ef"))} {context[end_idx:]} \n '),
+                markdown(
+                    f'From file: {filename} \n {context[:start_idx] } {str(annotation(answer, "ANSWER", "#8ef"))} {context[end_idx:]} \n '
+                ),
                 unsafe_allow_html=True,
             )
         except:
-            filename = result.meta.get('filename', "")
+            filename = result.meta.get("filename", "")
             st.write(
-                markdown(f'From file: {filename} \n {context[:start_idx] } {str(annotation(answer, "ANSWER", "#8ef"))} {context[end_idx:]} \n '),
+                markdown(
+                    f'From file: {filename} \n {context[:start_idx] } {str(annotation(answer, "ANSWER", "#8ef"))} {context[end_idx:]} \n '
+                ),
                 unsafe_allow_html=True,
             )
-
-
-
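
Note on the fix: the commit bypasses EmbeddingRetriever at query time and has a custom PineconeSearch node query the Pinecone index directly, wrapping each match back into a Haystack Document for the FARMReader. A minimal end-to-end sketch of the new query path, condensed from the code in this diff; it assumes the Haystack v1.x BaseComponent API and an already-populated "qa-demo-fast-384" index, and the API key placeholder, default top_k values, and sample question are illustrative, not part of the commit:

from typing import List, Optional

import pinecone
from haystack import BaseComponent, Document
from haystack.nodes import FARMReader
from haystack.pipelines import Pipeline
from sentence_transformers import SentenceTransformer

pinecone.init(api_key="<PINECONE_API_KEY>", environment="us-west1-gcp")  # placeholder key
index = pinecone.Index("qa-demo-fast-384")
emb_model = SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")


class PineconeSearch(BaseComponent):
    outgoing_edges = 1

    def run(self, query: str, top_k: Optional[int] = 3):
        # embed the question and pull the nearest chunks straight from Pinecone
        emb = emb_model.encode(query).tolist()
        response = index.query([emb], top_k=top_k, include_metadata=True)
        docs = [
            Document(content=m["metadata"]["text"], meta=m["metadata"])
            for m in response["matches"]
        ]
        return {"documents": docs, "query": query}, "output_1"

    def run_batch(self, queries: List[str], top_k: Optional[int] = 3):
        return {}, "output_1"  # batch mode unused by the demo


reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
pipe = Pipeline()
pipe.add_node(component=PineconeSearch(), name="Retriever", inputs=["Query"])
pipe.add_node(component=reader, name="Reader", inputs=["Retriever"])

result = pipe.run(
    query="My blog post discusses remote work. Give me statistics.",
    params={"Retriever": {"top_k": 3}, "Reader": {"top_k": 3}},
)
for ans in result["answers"]:
    print(ans.answer, "|", ans.meta.get("filename", ""))

With this wiring, EmbeddingRetriever is kept only for computing embeddings at indexing time; each user query triggers a single index.query call, and the reader sees only the matched text and its metadata.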