davanstrien (HF staff) committed
Commit a38b615
1 Parent(s): e878844

add better retries

Files changed (3):
  1. app.py +36 -21
  2. requirements.in +2 -1
  3. requirements.txt +4 -0
app.py CHANGED
@@ -7,36 +7,28 @@ from apscheduler.schedulers.background import BackgroundScheduler
 from cachetools import TTLCache, cached
 from setfit import SetFitModel
 from tqdm.auto import tqdm
+import stamina
+from arxiv import UnexpectedEmptyPageError, ArxivError
 
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 
 CACHE_TIME = 60 * 60 * 12  # 12 hours
-MAX_RESULTS = 200
+MAX_RESULTS = 300
 
 
-# def list_cacheable(func: Callable[..., Any]) -> Callable[..., Any]:
-#     @lru_cache(maxsize=100)
-#     def cacheable_func(*args: Any, **kwargs: Any) -> Any:
-#         return func(*args, **kwargs)
-
-#     @wraps(func)
-#     def wrapper(*args: Any, **kwargs: Any) -> Any:
-#         # Convert lists to tuples to make them hashable
-#         args = tuple(tuple(arg) if isinstance(arg, list) else arg for arg in args)
-#         kwargs = {k: tuple(v) if isinstance(v, list) else v for k, v in kwargs.items()}
-#         return cacheable_func(*args, **kwargs)
-
-#     return wrapper
+client = arxiv.Client(page_size=50, delay_seconds=3, num_retries=2)
 
 
 @cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME))
 def get_arxiv_result():
-    search = arxiv.Search(
-        query="ti:dataset AND abs:machine learning",
-        max_results=MAX_RESULTS,
-        sort_by=arxiv.SortCriterion.SubmittedDate,
-    )
-    return [
+    return _get_arxiv_result()
+
+
+@stamina.retry(
+    on=(ValueError, UnexpectedEmptyPageError, ArxivError), attempts=10, wait_max=60 * 15
+)
+def _get_arxiv_result():
+    results = [
         {
             "title": result.title,
             "abstract": result.summary,
@@ -44,8 +36,31 @@ def get_arxiv_result():
             "category": result.primary_category,
             "updated": result.updated,
         }
-        for result in tqdm(search.results(), total=MAX_RESULTS)
+        for result in tqdm(
+            client.results(
+                arxiv.Search(
+                    query="ti:dataset",
+                    max_results=MAX_RESULTS,
+                    sort_by=arxiv.SortCriterion.SubmittedDate,
+                )
+            ),
+            total=MAX_RESULTS,
+        )
     ]
+    if len(results) > 1:
+        return results
+    else:
+        raise ValueError("No results found")
+    # return [
+    #     {
+    #         "title": result.title,
+    #         "abstract": result.summary,
+    #         "url": result.entry_id,
+    #         "category": result.primary_category,
+    #         "updated": result.updated,
+    #     }
+    #     for result in tqdm(search.results(), total=MAX_RESULTS)
+    # ]
 
 
 def load_model():
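
The retry logic in this change operates at two levels: arxiv.Client retries individual page requests (num_retries=2, with delay_seconds=3 between pages), while the stamina decorator retries the whole fetch, up to 10 attempts with backoff waits capped at 15 minutes, and an empty-but-successful response is converted into a ValueError so that it also triggers a retry. A minimal sketch of that raise-on-empty pattern, with a hypothetical flaky_fetch standing in for the arXiv call and illustrative attempts/wait_max values:

import stamina


@stamina.retry(on=ValueError, attempts=3, wait_max=30)
def flaky_fetch() -> list:
    # Stand-in for the arXiv query: an empty result list is treated as a
    # failure, so stamina retries with exponential backoff and jitter.
    results = []  # imagine client.results(...) yielding nothing here
    if not results:
        raise ValueError("No results found")
    return results


if __name__ == "__main__":
    try:
        flaky_fetch()
    except ValueError:
        print("gave up after 3 attempts")

Raising instead of returning an empty list also matters for the @cached wrapper above: cachetools only stores results of calls that return successfully, so a bad response is not memoized for the 12-hour TTL.
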
requirements.in CHANGED
@@ -4,4 +4,5 @@ cachetools
 gradio
 hf-transfer
 scikit-learn==1.2.2
-setfit
+setfit
+stamina
requirements.txt CHANGED
@@ -274,10 +274,14 @@ sniffio==1.3.0
     # anyio
     # httpcore
     # httpx
+stamina==23.1.0
+    # via -r requirements.in
 starlette==0.27.0
     # via fastapi
 sympy==1.12
     # via torch
+tenacity==8.2.3
+    # via stamina
 threadpoolctl==3.2.0
     # via scikit-learn
 tokenizers==0.14.0