ToluClassics committed on
Commit
5277f05
•
1 Parent(s): adcedd4

Upload 23 files

Browse files
.gitattributes CHANGED
@@ -32,3 +32,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ lucene-index.miracl-v1.0-bn.20221004.2b2856/_0_Lucene90_0.doc filter=lfs diff=lfs merge=lfs -text
36
+ lucene-index.miracl-v1.0-bn.20221004.2b2856/_0_Lucene90_0.dvd filter=lfs diff=lfs merge=lfs -text
37
+ lucene-index.miracl-v1.0-bn.20221004.2b2856/_0_Lucene90_0.pos filter=lfs diff=lfs merge=lfs -text
38
+ lucene-index.miracl-v1.0-bn.20221004.2b2856/_0_Lucene90_0.tim filter=lfs diff=lfs merge=lfs -text
39
+ lucene-index.miracl-v1.0-bn.20221004.2b2856/_0.fdt filter=lfs diff=lfs merge=lfs -text
40
+ lucene-index.miracl-v1.0-bn.20221004.2b2856/_0.tvd filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Miracl Search - Bengali
3
- emoji: ⚡
4
- colorFrom: yellow
5
- colorTo: purple
6
  sdk: streamlit
7
- sdk_version: 1.17.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
1
  ---
2
+ title: Miracl Search
3
+ emoji: 🌍🙌🌏
4
+ colorFrom: red
5
+ colorTo: pink
6
  sdk: streamlit
7
+ sdk_version: 1.10.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
app.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import http.client as http_client
2
+ import json
3
+ import logging
4
+ import os
5
+ import pprint
6
+ import re
7
+ import time
8
+ import string
9
+
10
+ import streamlit as st
11
+
12
+ import streamlit.components.v1 as components
13
+ from typing import Callable, Optional, Tuple, Union
14
+ from pyserini import util
15
+ from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder
16
+
17
+
18
VERSION = '1.0'
st.set_page_config(page_title="Miracl Search - Bengali", layout="wide")

# Ensure a ".streamlit" directory exists under the current working directory,
# then (re)write its config.toml with a [theme] section selecting the
# "light" base theme.
# NOTE(review): presumably Streamlit reads this file at startup, so the write
# may only take effect on the next launch -- confirm.
os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
_theme_config_path = os.path.join(os.getcwd(), ".streamlit/config.toml")
with open(_theme_config_path, "w") as config_file:
    config_file.write('[theme]\nbase="light"')

# Type alias covering both searcher classes imported from pyserini.
Searcher = Union[FaissSearcher, LuceneSearcher]

# UI language name -> language code interpolated into the index directory name.
LANG_MAPPING = {'Bengali': 'bn'}
29
+
30
+
31
# Sidebar header: demo title, emoji banner and a short description of MIRACL.
# The HTML is kept flush-left inside the string so Markdown cannot interpret
# indented lines as a code block.
st.sidebar.markdown(
    """
<style>
.aligncenter {
    text-align: center;
    font-weight: bold;
    font-size: 30px;
}
</style>
<p class="aligncenter">MIRACL Bengali Demo</p>
<p class="aligncenter">🌍🙌🌏</p>
<p style="text-align: center;"> MIRACL is a multilingual dataset for ad hoc retrieval that consists of 18 different languages, collectively encompassing over three billion native speakers around the world.</p>
""",
    unsafe_allow_html=True,
)

# Sidebar links to the project's GitHub organisation and the paper.
st.sidebar.markdown(
    """
<style>
.aligncenter {
    text-align: center;
}
</style>
<p style='text-align: center'>
<a href="https://github.com/project-miracl" >GitHub</a> | <a href="https://arxiv.org/abs/2210.09984" >Paper</a>
</p>
""",
    unsafe_allow_html=True,
)

# Free-text query typed by the user; empty until something is entered.
query = st.sidebar.text_input(label='Search query', value='')

# This demo serves a single language; the value must be a key of LANG_MAPPING.
language = 'Bengali'

# Upper bound on the number of hits requested from the searcher.
max_results = st.sidebar.slider(
    "Maximum Number of Results",
    min_value=1,
    max_value=1000,
    step=1,
    value=10,
    help="Maximum Number of Documents to return",
)
73
+
74
def _load_sparse_searcher(language: str, k1: Optional[float] = None, b: Optional[float] = None) -> Searcher:
    """Open the local Lucene index for *language* and return its searcher.

    Args:
        language: Language code; it is interpolated into the index directory
            name ``lucene-index.miracl-v1.0-{language}.20221004.2b2856`` and
            also passed to ``set_language``.
        k1: Optional BM25 ``k1`` parameter.
        b: Optional BM25 ``b`` parameter.

    Returns:
        A ``LuceneSearcher`` for the index. ``set_bm25(k1, b)`` is applied
        only when *both* ``k1`` and ``b`` are given; otherwise the searcher
        keeps whatever scoring it was constructed with.
    """
    searcher = LuceneSearcher(f'lucene-index.miracl-v1.0-{language}.20221004.2b2856')
    searcher.set_language(language)

    # A lone k1 or b is ignored: both are required to reconfigure BM25.
    if k1 is not None and b is not None:
        searcher.set_bm25(k1, b)

    return searcher
84
+
85
def search(query, language, num_results=10):
    """Run *query* against the index for *language* and collect the hits.

    Args:
        query: Query string handed to the searcher as-is.
        language: UI language name; must be a key of ``LANG_MAPPING``.
        num_results: Maximum number of hits to request (passed as ``k``).

    Returns:
        A ``(results, seconds)`` tuple. ``results`` is a dict with parallel
        lists under ``"docs"`` (the ``"text"`` field of each hit's raw JSON),
        ``"doc_ids"`` (its ``"docid"`` field) and ``"score"`` (the hit
        score), plus ``"lang"`` holding the *language* argument. ``seconds``
        is the time spent in the ``searcher.search`` call only; loading the
        index is not included.

    Raises:
        KeyError: If *language* is not in ``LANG_MAPPING``.
    """
    searcher = _load_sparse_searcher(language=LANG_MAPPING[language])

    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    search_results = searcher.search(query, k=num_results)
    search_time = time.perf_counter() - started

    results_dict = {"docs": [], "doc_ids": [], "score": [], "lang": language}
    for hit in search_results:
        # Each hit carries its stored document as a JSON string in ``raw``.
        document = json.loads(hit.raw)
        results_dict["docs"].append(document["text"])
        results_dict["doc_ids"].append(document["docid"])
        results_dict["score"].append(hit.score)

    return results_dict, search_time
100
+
101
+
102
+
103
def highlight_string(paragraph: str, highlight_terms: list) -> str:
    """Wrap every whole-word occurrence of the given terms in ``<b>`` tags.

    Matching is case-insensitive and the matched text keeps the casing it
    has in *paragraph*. Terms are treated as literal text, never as regular
    expressions. Empty terms are ignored.

    Args:
        paragraph: Text to annotate.
        highlight_terms: Terms to highlight; may be empty.

    Returns:
        *paragraph* with each match replaced by ``<b>match</b>``; the input
        is returned unchanged when there is nothing to highlight.
    """
    # Drop empty terms (an empty alternative would match at every word
    # boundary) and duplicates, keeping first-seen order for determinism.
    terms = list(dict.fromkeys(term for term in highlight_terms if term))
    if not terms:
        return paragraph

    # Longest first so a short term cannot pre-empt a longer one that starts
    # at the same position.
    terms.sort(key=len, reverse=True)

    # A single pass over the text: tags inserted for one term can never be
    # re-matched by another term (e.g. the term "b" hitting "<b>").
    # NOTE(review): \b is defined via \w, which may not line up with word
    # edges in scripts that use combining marks -- confirm for Bengali text.
    pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(term) for term in terms) + r")\b",
        flags=re.I,
    )
    return pattern.sub(lambda match: f"<b>{match.group(0)}</b>", paragraph)
107
+
108
def process_results(hits: dict, highlight_terms: list) -> str:
    """Render search hits as an HTML fragment, one ``<div>`` per hit.

    Args:
        hits: Dict with parallel lists under ``'doc_ids'``, ``'score'`` and
            ``'docs'``, plus a ``'lang'`` entry shown on every hit.
        highlight_terms: Terms passed to ``highlight_string`` for each
            document body.

    Returns:
        The per-hit HTML snippets joined by a single space; an empty string
        when there are no hits.
    """
    import html  # local import so this block does not depend on file-level imports

    # Everything interpolated below comes from the index or the caller, so it
    # is escaped before being placed into markup.
    lang = html.escape(str(hits['lang']))

    rendered = []
    rows = zip(hits['doc_ids'], hits['score'], hits['docs'])
    for rank, (doc_id, score, text) in enumerate(rows, start=1):
        # Escape first, highlight second: the <b> tags added by
        # highlight_string must survive, the document's own markup must not.
        body = highlight_string(html.escape(text, quote=False), highlight_terms)
        rendered.append(f"""
        <div class='searchresult'>
        <h2>{rank}. Document ID: {html.escape(str(doc_id))}</h2>
        <p>Language: <strong>{lang}</strong>, Score: {round(score, 2)}</p>
        <p>{body}</p>
        </div>
        <hr>
        """)
    return " ".join(rendered)
121
+
122
+
123
+
124
# Runs once per click of the sidebar "Search" button: query the index, render
# the hits as HTML and show them inside a scrollable embedded component.
search_clicked = st.sidebar.button("Search")

if search_clicked and not query.strip():
    # Nothing to look up; ask for input instead of rendering an empty page.
    st.warning("Please enter a search query.")
elif search_clicked:
    hits, search_time = search(query, language, max_results)

    # No highlight terms are supplied, so document text is shown unmarked.
    html_results = process_results(hits, [])

    # Report what was actually retrieved, not the requested maximum.
    num_hits = len(hits['doc_ids'])
    rendered_results = f"""
    <div id="searchresultsarea">
        <br>
        <p id="searchresultsnumber">About {num_hits} results ({search_time:.2f} seconds)</p>
        {html_results}
    </div>
    """

    # Stylesheets are injected as flush-left HTML so Markdown cannot treat
    # the lines as an indented code block.
    st.markdown(
        '<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"\n'
        'integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">',
        unsafe_allow_html=True,
    )
    st.markdown(
        '<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">',
        unsafe_allow_html=True,
    )
    st.markdown(
        '<div class="row no-gutters mt-3 align-items-center">\n'
        '<h2> Search Results </h2>\n'
        '</div>',
        unsafe_allow_html=True,
    )

    # Styles, helper scripts and the dark-mode toggle are shipped together
    # with the results in the same embedded HTML document.
    components.html(
        """
        <style>
        #searchresultsarea {
            font-family: 'Arial';
        }

        #searchresultsnumber {
            font-size: 0.8rem;
            color: gray;
        }

        .searchresult h2 {
            font-size: 19px;
            line-height: 18px;
            font-weight: normal;
            color: rgb(7, 111, 222);
            margin-bottom: 0px;
            margin-top: 25px;
        }

        .searchresult a {
            font-size: 12px;
            line-height: 12px;
            color: green;
            margin-bottom: 0px;
        }

        .dark-mode {
            color: white;
        }
        </style>
        <script>
        function load_image(id){
            console.log(id)
            var x = document.getElementById(id);
            console.log(x)
            if (x.style.display === "none") {
                x.style.display = "block";
            } else {
                x.style.display = "none";
            }
        };
        function myFunction() {
            var element = document.body;
            element.classList.toggle("dark-mode");
        }
        </script>
        <button onclick="myFunction()">Toggle dark mode</button>
        """ + rendered_results,
        height=800,
        scrolling=True,
    )
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0.fdm ADDED
Binary file (284 Bytes). View file
 
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0.fdt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da8c07667ca7c4b80c6e2f27b6841bde38e98b2f77bd6ba8ebae6f24431a7de1
3
+ size 142530944
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0.fdx ADDED
Binary file (16.3 kB). View file
 
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0.fnm ADDED
Binary file (340 Bytes). View file
 
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0.nvd ADDED
Binary file (297 kB). View file
 
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0.nvm ADDED
Binary file (103 Bytes). View file
 
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0.si ADDED
Binary file (509 Bytes). View file
 
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0.tvd ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d852c9f32bc07bcdcf529989adbbb942f32ed41184f15b2b4360b417e61c891
3
+ size 102577215
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0.tvm ADDED
Binary file (1.21 kB). View file
 
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0.tvx ADDED
Binary file (90.3 kB). View file
 
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0_Lucene90_0.doc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb9d967ae8cb98c78f2b49fe2e124f93befca12ebbafa2ce379ca2a0bf99acbf
3
+ size 18965534
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0_Lucene90_0.dvd ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cb27fe20fbeab3424704e3e362068e77017e66dd71ba2cb9e776e57446e4b18
3
+ size 2841303
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0_Lucene90_0.dvm ADDED
Binary file (255 Bytes). View file
 
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0_Lucene90_0.pos ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7742570f4594f271ef3305678870f08d29837b59bd4aadf3e3552069498a753f
3
+ size 14588563
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0_Lucene90_0.tim ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:551d897f92450a25e0819c4d27455f96c6e95ced78a73c064803d464950758c4
3
+ size 7926497
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0_Lucene90_0.tip ADDED
Binary file (233 kB). View file
 
lucene-index.miracl-v1.0-bn.20221004.2b2856/_0_Lucene90_0.tmd ADDED
Binary file (303 Bytes). View file
 
lucene-index.miracl-v1.0-bn.20221004.2b2856/segments_1 ADDED
Binary file (154 Bytes). View file
 
lucene-index.miracl-v1.0-bn.20221004.2b2856/write.lock ADDED
File without changes
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ openjdk-11-jdk
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ pyserini
2
+ faiss-cpu
3
+ torch