ola13 committed on
Commit
b6da1a8
1 Parent(s): e55d3fc

choose corpus

Browse files
Files changed (2) hide show
  1. .streamlit/config.toml +2 -0
  2. app.py +27 -67
.streamlit/config.toml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [theme]
2
+ base="light"
app.py CHANGED
@@ -1,10 +1,7 @@
1
- import http.client as http_client
2
  import json
3
- import logging
4
  import os
5
  import pprint
6
  import re
7
- import string
8
 
9
  import streamlit as st
10
  import streamlit.components.v1 as components
@@ -12,30 +9,15 @@ import requests
12
 
13
 
14
  pp = pprint.PrettyPrinter(indent=2)
 
 
 
15
  st.set_page_config(page_title="Gaia Search", layout="wide")
16
 
17
  os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
18
  with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
19
  file.write('[theme]\nbase="light"')
20
 
21
- LANG_MAPPING = {
22
- "Arabic": "ar",
23
- "Catalan": "ca",
24
- "Code": "code",
25
- "English": "en",
26
- "Spanish": "es",
27
- "French": "fr",
28
- "Indonesian": "id",
29
- "Indic": "indic",
30
- "Niger-Congo": "nigercongo",
31
- "Portuguese": "pt",
32
- "Vietnamese": "vi",
33
- "Chinese": "zh",
34
- "Detect Language": "detect_language",
35
- "All": "all",
36
- }
37
-
38
-
39
  st.sidebar.markdown(
40
  """
41
  <style>
@@ -71,25 +53,10 @@ st.sidebar.markdown(
71
  )
72
 
73
  query = st.sidebar.text_input(label="Search query", value="")
74
- language = st.sidebar.selectbox(
75
- "Language",
76
- (
77
- "Arabic",
78
- "Catalan",
79
- "Code",
80
- "English",
81
- "Spanish",
82
- "French",
83
- "Indonesian",
84
- "Indic",
85
- "Niger-Congo",
86
- "Portuguese",
87
- "Vietnamese",
88
- "Chinese",
89
- "Detect Language",
90
- "All",
91
- ),
92
- index=3,
93
  )
94
  max_results = st.sidebar.slider(
95
  "Maximum Number of Results",
@@ -117,15 +84,14 @@ text-align: center;
117
  st.sidebar.markdown(footer, unsafe_allow_html=True)
118
 
119
 
120
- def scisearch(query, language, num_results=10):
121
  try:
 
122
  query = query.strip()
123
  if query == "" or query is None:
124
  return
125
 
126
- post_data = {"query": query, "k": num_results}
127
- if language != "detect_language":
128
- post_data["lang"] = language
129
 
130
  output = requests.post(
131
  os.environ.get("address"),
@@ -135,18 +101,10 @@ def scisearch(query, language, num_results=10):
135
  )
136
 
137
  payload = json.loads(output.text)
138
-
139
- if "err" in payload:
140
- if payload["err"]["type"] == "unsupported_lang":
141
- detected_lang = payload["err"]["meta"]["detected_lang"]
142
- return f"""
143
- <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
144
- Detected language <b>{detected_lang}</b> is not supported.<br>
145
- Please choose a language from the dropdown or type another query.
146
- </p><br><hr><br>"""
147
-
148
  results = payload["results"]
149
  highlight_terms = payload["highlight_terms"]
 
 
150
  except Exception as e:
151
  results_html = f"""
152
  <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
@@ -157,7 +115,7 @@ def scisearch(query, language, num_results=10):
157
  """
158
  print(e)
159
 
160
- return results, highlight_terms
161
 
162
 
163
  PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
@@ -176,8 +134,9 @@ def process_pii(text):
176
 
177
 
178
  def highlight_string(paragraph: str, highlight_terms: list) -> str:
179
- for term in highlight_terms:
180
- paragraph = re.sub(f"\\b{term}\\b", f"<b>{term}</b>", paragraph, flags=re.I)
 
181
  paragraph = process_pii(paragraph)
182
  return paragraph
183
 
@@ -187,16 +146,17 @@ def process_results(hits: list, highlight_terms: list) -> str:
187
  for i, hit in enumerate(hits):
188
  res_head = f"""
189
  <div class="searchresult">
190
- <h2>{i+1}. Document ID: {hit['docid']}</h2>
191
- <p>Language: <string>FIX MEEEE</string>, Score: {round(hit['score'], 2)}</p>
192
  """
193
- for subhit in hit["meta"]["docs"]:
194
- res_head += f"""
195
- <button onclick="load_image({subhit['_id']})">Load Image</button><br>
196
- <p><img id='{subhit['_id']}' src='{subhit['URL']}' style="width:400px;height:auto;display:none;"></p>
197
- <a href='{subhit['URL']}'>{subhit['URL']}</a>
198
- <p>{highlight_string(subhit['TEXT'], highlight_terms)}</p>
199
- """
 
 
200
  res_head += f"""
201
  <p>{highlight_string(hit['text'], highlight_terms)}</p>
202
  </div>
@@ -207,7 +167,7 @@ def process_results(hits: list, highlight_terms: list) -> str:
207
 
208
 
209
  if st.sidebar.button("Search"):
210
- hits, highlight_terms = scisearch(query, LANG_MAPPING[language], max_results)
211
  html_results = process_results(hits, highlight_terms)
212
  rendered_results = f"""
213
  <div id="searchresultsarea">
 
 
1
  import json
 
2
  import os
3
  import pprint
4
  import re
 
5
 
6
  import streamlit as st
7
  import streamlit.components.v1 as components
 
9
 
10
 
11
  pp = pprint.PrettyPrinter(indent=2)
12
+
13
+ os.environ["address"] = "http://34.79.83.149:8080"
14
+
15
  st.set_page_config(page_title="Gaia Search", layout="wide")
16
 
17
  os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
18
  with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
19
  file.write('[theme]\nbase="light"')
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  st.sidebar.markdown(
22
  """
23
  <style>
 
53
  )
54
 
55
  query = st.sidebar.text_input(label="Search query", value="")
56
+ corpus = st.sidebar.selectbox(
57
+ "Corpus",
58
+ ("laion", "pile", "c4"),
59
+ index=0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  )
61
  max_results = st.sidebar.slider(
62
  "Maximum Number of Results",
 
84
  st.sidebar.markdown(footer, unsafe_allow_html=True)
85
 
86
 
87
+ def scisearch(query, corpus, num_results=10):
88
  try:
89
+ print(query, corpus, num_results)
90
  query = query.strip()
91
  if query == "" or query is None:
92
  return
93
 
94
+ post_data = {"query": query, "corpus": corpus, "k": num_results}
 
 
95
 
96
  output = requests.post(
97
  os.environ.get("address"),
 
101
  )
102
 
103
  payload = json.loads(output.text)
 
 
 
 
 
 
 
 
 
 
104
  results = payload["results"]
105
  highlight_terms = payload["highlight_terms"]
106
+ return results, highlight_terms
107
+
108
  except Exception as e:
109
  results_html = f"""
110
  <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
 
115
  """
116
  print(e)
117
 
118
+
119
 
120
 
121
  PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
 
134
 
135
 
136
  def highlight_string(paragraph: str, highlight_terms: list) -> str:
137
+ # TODO:
138
+ # for term in highlight_terms:
139
+ # paragraph = re.sub(f"\\b{term}\\b", f"<b>{term}</b>", paragraph, flags=re.I)
140
  paragraph = process_pii(paragraph)
141
  return paragraph
142
 
 
146
  for i, hit in enumerate(hits):
147
  res_head = f"""
148
  <div class="searchresult">
149
+ <h2>{i+1}. Document ID: {hit['docid']}</h2>, Score: {round(hit['score'], 2)}</p>
 
150
  """
151
+ if "meta" in hit:
152
+ if hit["meta"] is not None and "docs" in hit["meta"]:
153
+ for subhit in hit["meta"]["docs"]:
154
+ res_head += f"""
155
+ <button onclick="load_image({subhit['_id']})">Load Image</button><br>
156
+ <p><img id='{subhit['_id']}' src='{subhit['URL']}' style="width:400px;height:auto;display:none;"></p>
157
+ <a href='{subhit['URL']}'>{subhit['URL']}</a>
158
+ <p>{highlight_string(subhit['TEXT'], highlight_terms)}</p>
159
+ """
160
  res_head += f"""
161
  <p>{highlight_string(hit['text'], highlight_terms)}</p>
162
  </div>
 
167
 
168
 
169
  if st.sidebar.button("Search"):
170
+ hits, highlight_terms = scisearch(query, corpus, max_results)
171
  html_results = process_results(hits, highlight_terms)
172
  rendered_results = f"""
173
  <div id="searchresultsarea">