ToluClassics committed on
Commit
a26c01c
•
1 Parent(s): 9bc7ae5

Upload 23 files

Browse files
.gitattributes CHANGED
@@ -32,3 +32,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ lucene-index.miracl-v1.0-ar.20221004.2b2856/_5_Lucene90_0.doc filter=lfs diff=lfs merge=lfs -text
36
+ lucene-index.miracl-v1.0-ar.20221004.2b2856/_5_Lucene90_0.dvd filter=lfs diff=lfs merge=lfs -text
37
+ lucene-index.miracl-v1.0-ar.20221004.2b2856/_5_Lucene90_0.pos filter=lfs diff=lfs merge=lfs -text
38
+ lucene-index.miracl-v1.0-ar.20221004.2b2856/_5_Lucene90_0.tim filter=lfs diff=lfs merge=lfs -text
39
+ lucene-index.miracl-v1.0-ar.20221004.2b2856/_5.fdt filter=lfs diff=lfs merge=lfs -text
40
+ lucene-index.miracl-v1.0-ar.20221004.2b2856/_5.nvd filter=lfs diff=lfs merge=lfs -text
41
+ lucene-index.miracl-v1.0-ar.20221004.2b2856/_5.tvd filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Miracl Arabic
3
- emoji: 🐒
4
- colorFrom: green
5
- colorTo: red
6
  sdk: streamlit
7
- sdk_version: 1.17.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
1
  ---
2
+ title: Miracl Search
3
+ emoji: 🌍🙌🌏
4
+ colorFrom: red
5
+ colorTo: pink
6
  sdk: streamlit
7
+ sdk_version: 1.10.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
app.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import http.client as http_client
2
+ import json
3
+ import logging
4
+ import os
5
+ import pprint
6
+ import re
7
+ import time
8
+ import string
9
+
10
+ import streamlit as st
11
+
12
+ import streamlit.components.v1 as components
13
+ from typing import Callable, Optional, Tuple, Union
14
+ from pyserini import util
15
+ from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder
16
+
17
+
18
# Version string of this demo application.
VERSION = '1.0'

# Page-level Streamlit settings: browser-tab title and full-width layout.
st.set_page_config(page_title="Miracl Search", layout="wide")

# Write a Streamlit config file under the current working directory that
# selects the "light" base theme.
_streamlit_dir = os.path.join(os.getcwd(), ".streamlit")
os.makedirs(_streamlit_dir, exist_ok=True)
_theme_config = '[theme]\nbase="light"'
with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as config_file:
    config_file.write(_theme_config)

# Type alias covering both searcher classes imported from pyserini.
Searcher = Union[FaissSearcher, LuceneSearcher]

# Display name shown in the UI -> language code used in index names.
LANG_MAPPING = {'Arabic': 'ar'}
29
+
30
def download_indices(language):
    """Download the prebuilt MIRACL Lucene index for ``language`` if it is not on disk.

    Args:
        language: Language code that is substituted into the index name
            (for example ``'ar'``).

    The index archive is fetched and unpacked into the local ``indices``
    directory through ``util.download_and_unpack_index``. Nothing is
    downloaded when the index directory already exists.
    """
    index_name = f'lucene-index.miracl-v1.0-{language}.20221004.2b2856'

    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair, which could
    # fail if the directory appeared between the check and the creation.
    os.makedirs('indices', exist_ok=True)

    if not os.path.exists(f'indices/{index_name}'):
        util.download_and_unpack_index(
            f"https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/{index_name}.tar.gz",
            "indices",
        )
36
+
37
+
38
+
39
# Sidebar banner: demo title, the MIRACL globe/hands emoji line and a one-sentence
# description of the dataset. The emoji line is written with the real emoji
# characters rather than their mis-decoded byte sequence.
st.sidebar.markdown(
    """
<style>
.aligncenter {
text-align: center;
font-weight: bold;
font-size: 30px;
}
</style>
<p class="aligncenter">MIRACL Arabic Demo</p>
<p class="aligncenter">🌍🙌🌏</p>
<p style="text-align: center;"> MIRACL is a multilingual dataset for ad hoc retrieval that consists of 18 different languages, collectively encompassing over three billion native speakers around the world.</p>
""",
    unsafe_allow_html=True,
)
54
+
55
# Sidebar footer: centered links to the project's GitHub organisation and paper.
_links_html = """
<style>
.aligncenter {
text-align: center;
}
</style>
<p style='text-align: center'>
<a href="https://github.com/project-miracl" >GitHub</a> | <a href="https://arxiv.org/abs/2210.09984" >Paper</a>
</p>
"""
st.sidebar.markdown(_links_html, unsafe_allow_html=True)
68
+
69
# Sidebar controls: free-text query, language picker and result-count slider.
query = st.sidebar.text_input(label='Search query', value='')
language = st.sidebar.selectbox(
    'Language',
    list(LANG_MAPPING),
    # Preselect the first language. The previous index=3 pointed past the end of
    # the single-entry option list, which Streamlit rejects.
    index=0,
)
max_results = st.sidebar.slider(
    "Maximum Number of Results",
    min_value=1,
    max_value=1000,
    step=1,
    value=10,
    help="Maximum Number of Documents to return",
)
82
+
83
+
84
def _load_sparse_searcher(language: str, k1: Optional[float]=None, b: Optional[float]=None) -> Searcher:
    """Open the local MIRACL Lucene index for ``language`` and return its searcher.

    Args:
        language: Language code used both in the index directory name and in
            the ``set_language`` call on the searcher.
        k1: Optional BM25 ``k1`` parameter.
        b: Optional BM25 ``b`` parameter.

    Returns:
        A ``LuceneSearcher`` over ``indices/lucene-index.miracl-v1.0-<language>.20221004.2b2856``.
    """
    searcher = LuceneSearcher(f'indices/lucene-index.miracl-v1.0-{language}.20221004.2b2856')
    searcher.set_language(language)

    # BM25 parameters are only overridden when both values are supplied;
    # otherwise the searcher keeps whatever configuration it was created with.
    if k1 is not None and b is not None:
        searcher.set_bm25(k1, b)

    return searcher
94
+
95
def search(query, language, num_results=10):
    """Run ``query`` against the index of ``language`` and collect the hits.

    Args:
        query: Query text passed to the searcher.
        language: Display name of the language; a key of ``LANG_MAPPING``.
        num_results: Number of hits requested from the searcher.

    Returns:
        A ``(results_dict, search_time)`` tuple. ``results_dict`` holds the
        parallel lists ``"docs"``, ``"doc_ids"`` and ``"score"`` plus the
        ``"lang"`` display name; ``search_time`` is the wall-clock duration of
        the search call in seconds.
    """
    searcher = _load_sparse_searcher(language=LANG_MAPPING[language])

    # Time only the search call itself, not the opening of the index.
    started = time.time()
    hits = searcher.search(query, k=num_results)
    search_time = time.time() - started

    results_dict = {"docs": [], "doc_ids": [], "score": [], "lang": language}
    for hit in hits:
        # Each hit's raw payload is a JSON object with "text" and "docid" fields.
        record = json.loads(hit.raw)
        results_dict["docs"].append(record["text"])
        results_dict["doc_ids"].append(record["docid"])
        results_dict["score"].append(hit.score)

    return results_dict, search_time
110
+
111
+
112
+
113
def highlight_string(paragraph: str, highlight_terms: list) -> str:
    """Wrap every whole-word occurrence of the given terms in ``<b>`` tags.

    Matching is case-insensitive and the matched text keeps its original
    casing. Terms are treated as literal text, not as regular expressions.

    Args:
        paragraph: Text to highlight.
        highlight_terms: Terms to emphasise; empty terms are ignored.

    Returns:
        ``paragraph`` with each match surrounded by ``<b>...</b>``.
    """
    # Drop empty terms (they would match at every word boundary) and duplicates,
    # and try longer terms first so they win over their own prefixes.
    terms = sorted(
        dict.fromkeys(term for term in highlight_terms if term),
        key=len,
        reverse=True,
    )
    if not terms:
        return paragraph

    # One combined pattern means a single pass over the text, so a later term
    # can never match inside a <b> tag inserted for an earlier one.
    alternatives = "|".join(re.escape(term) for term in terms)
    pattern = re.compile(rf"\b(?:{alternatives})\b", flags=re.I)

    # A function replacement avoids backslash interpretation in the replacement
    # and keeps the casing found in the paragraph.
    return pattern.sub(lambda match: f"<b>{match.group(0)}</b>", paragraph)
117
+
118
def process_results(hits: dict, highlight_terms: list) -> str:
    """Render search hits as an HTML fragment, one ``searchresult`` div per hit.

    Args:
        hits: Mapping with the parallel lists ``'doc_ids'``, ``'docs'`` and
            ``'score'`` and the scalar ``'lang'``.
        highlight_terms: Terms passed to ``highlight_string`` for each document.

    Returns:
        The per-hit HTML snippets joined by a single space; an empty string
        when there are no hits.
    """
    import html  # function-scope import keeps this block self-contained

    hit_list = []
    rows = zip(hits['doc_ids'], hits['docs'], hits['score'])
    for rank, (doc_id, text, score) in enumerate(rows, start=1):
        # Escape the document text before adding <b> tags so that characters
        # such as "<" or "&" in a passage cannot break the surrounding markup.
        body = highlight_string(html.escape(text, quote=False), highlight_terms)
        res_head = f"""
<div class='searchresult'>
<h2>{rank}. Document ID: {html.escape(str(doc_id), quote=False)}</h2>
<p>Language: <strong>{hits['lang']}</strong>, Score: {round(score, 2)}</p>
<p>{body}</p>
</div>
<hr>
"""
        hit_list.append(res_head)
    return " ".join(hit_list)
131
+
132
+
133
+
134
# Search action: when the sidebar button is pressed, make sure the index is
# available, run the query and render the hits inside an HTML component.
if st.sidebar.button("Search"):
    download_indices(LANG_MAPPING[language])
    hits, search_time = search(query, language, max_results)
    html_results = process_results(hits, [])

    # Report the number of hits actually returned; the slider value is only an
    # upper bound and over-reports when fewer documents match.
    num_hits = len(hits["doc_ids"])
    rendered_results = f"""
<div id="searchresultsarea">
<br>
<p id="searchresultsnumber">About {num_hits} results</p>
{html_results}
</div>
"""

    # Stylesheets pulled into the main page (Bootstrap and Font Awesome).
    st.markdown("""
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
""",
        unsafe_allow_html=True)
    st.markdown(
        """
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
""",
        unsafe_allow_html=True)

    # Section heading above the results.
    st.markdown(
        """
<div class="row no-gutters mt-3 align-items-center">
<h2> Search Results </h2>
</div>
""",
        unsafe_allow_html=True)

    # The results are rendered through components.html together with their own
    # CSS and a small script providing the dark-mode toggle button.
    components.html(
        """
<style>
#searchresultsarea {
font-family: 'Arial';
}

#searchresultsnumber {
font-size: 0.8rem;
color: gray;
}

.searchresult h2 {
font-size: 19px;
line-height: 18px;
font-weight: normal;
color: rgb(7, 111, 222);
margin-bottom: 0px;
margin-top: 25px;
}

.searchresult a {
font-size: 12px;
line-height: 12px;
color: green;
margin-bottom: 0px;
}

.dark-mode {
color: white;
}
</style>
<script>
function load_image(id){
console.log(id)
var x = document.getElementById(id);
console.log(x)
if (x.style.display === "none") {
x.style.display = "block";
} else {
x.style.display = "none";
}
};
function myFunction() {
var element = document.body;
element.classList.toggle("dark-mode");
}
</script>
<button onclick="myFunction()">Toggle dark mode</button>
""" + rendered_results, height=800, scrolling=True
    )
213
+
214
+
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5.fdm ADDED
Binary file (789 Bytes). View file
 
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5.fdt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a3711a2b0977bc2e49928e5a113e70b64ef32f3b776875f2beea98720fccf5f
3
+ size 701613923
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5.fdx ADDED
Binary file (67.3 kB). View file
 
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5.fnm ADDED
Binary file (340 Bytes). View file
 
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5.nvd ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e780d2789d039a662615c248a04050f9d592f5667abf5cd77417dcf4a0f0cc75
3
+ size 2061473
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5.nvm ADDED
Binary file (103 Bytes). View file
 
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5.si ADDED
Binary file (545 Bytes). View file
 
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5.tvd ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e7f73815d662265e24a0992b47a1f6d8229a4631d038b336672621db9c50a6e
3
+ size 470885378
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5.tvm ADDED
Binary file (3.99 kB). View file
 
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5.tvx ADDED
Binary file (338 kB). View file
 
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5_Lucene90_0.doc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbec3544ee98c9cb4cb71857ade75f882c2ed3aafbc579888b246d18519ce4dc
3
+ size 123262995
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5_Lucene90_0.dvd ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cdc9fff54b5c643f716746f105539031a84cd2b3352064451f74d526cd8511b
3
+ size 21357338
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5_Lucene90_0.dvm ADDED
Binary file (822 Bytes). View file
 
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5_Lucene90_0.pos ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30cd9ff3eae496fdc67878a4f48cf7ca47f82bbf8f9185ae1fd81bc36b45102d
3
+ size 90321986
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5_Lucene90_0.tim ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94783d87515ecacbb3c85495229cbfcc502aaa71d7ea058e9330bd0682f40fc8
3
+ size 26275474
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5_Lucene90_0.tip ADDED
Binary file (825 kB). View file
 
lucene-index.miracl-v1.0-ar.20221004.2b2856/_5_Lucene90_0.tmd ADDED
Binary file (307 Bytes). View file
 
lucene-index.miracl-v1.0-ar.20221004.2b2856/segments_2 ADDED
Binary file (154 Bytes). View file
 
lucene-index.miracl-v1.0-ar.20221004.2b2856/write.lock ADDED
File without changes
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ openjdk-11-jdk
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ pyserini
2
+ faiss-cpu
3
+ torch