import os
import time

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util


def find_abstracts(soup):
    """Extract the identifier, title, and abstract from each CSW record."""
    id_list = []
    title_list = []
    abs_list = []
    for record in soup.find_all("csw:record"):
        identifier = record.find("dc:identifier")
        title = record.find("dc:title")
        abstract = record.find("dct:abstract")
        id_list.append(identifier.text)
        title_list.append(title.text)
        # Not every record has an abstract; keep a placeholder so the
        # three lists stay aligned.
        if abstract is not None:
            abs_list.append(abstract.text)
        else:
            abs_list.append("NA")
    return id_list, title_list, abs_list


def get_metadata():
    """Fetch dataset metadata from the NCEI Geoportal and cache it to CSV."""
    URL = "https://www.ncei.noaa.gov/metadata/geoportal/opensearch?f=csw&from=0&size=5000&sort=title.sort"
    page = requests.get(URL)
    soup = BeautifulSoup(page.text, "lxml")
    id_list, title_list, abs_list = find_abstracts(soup)
    df = pd.DataFrame(
        list(zip(id_list, title_list, abs_list)),
        columns=["identifier", "title", "abstract"],
    )
    df.to_csv("./ncei-metadata.csv", index=False)
    return df


def show_model(query):
    """Rank dataset abstracts against a natural-language query."""
    path = "./ncei-metadata.csv"
    DAY = 86400  # seconds

    # Refresh the cached metadata if it is missing or more than a day old.
    if os.path.exists(path):
        last_modified = os.path.getmtime(path)
        if time.time() - last_modified > DAY:
            df = get_metadata()
        else:
            df = pd.read_csv(path)
    else:
        df = get_metadata()

    # Use the abstracts as the documents to search over.
    docs_df = df[df["abstract"] != "NA"]
    docs = list(docs_df["abstract"])
    titles = list(docs_df["title"])

    # Load the model.
    model = SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")

    # Encode the query and the documents.
    query_emb = model.encode(query)
    doc_emb = model.encode(docs)

    # Compute the dot score between the query and all document embeddings.
    scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

    # Combine docs, scores, and titles, sorted by decreasing score.
    doc_score_pairs = sorted(
        zip(docs, scores, titles), key=lambda x: x[1], reverse=True
    )
    return doc_score_pairs


def main():
    st.title("Semantic Search for Datasets Using Sentence Transformers")
    st.write("A case study for the National Centers for Environmental Information (NCEI)")
    st.image("noaa_logo.png", width=150)
    st.write("## Goal: search for datasets in NCEI's archive using natural language queries")
    st.write("[Repo](https://github.com/myrandaGoesToSpace/semantic-search-datasets)")
    st.image("pres-whatisnoaa.png")
    st.write("## The Problem Context")
    st.write("NCEI uses a service called OneStop for data search.")
    st.write("**Problems:**")
    st.write("- Uses keyword search, which is not robust to natural language queries")
    st.write("- Filtering options are too specific for non-expert users")
    st.write("## The Model: [Sentence Transformers](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1)")
    st.image("pres-sentencetransformers.png")
    st.write("## Project Data")
    st.image("pres-metadata.png")
    st.write("## The Process")
    st.image("pres-creatingse.png")
    st.write("## Results and Demo")
    st.write("[Demo Notebook](https://github.com/myrandaGoesToSpace/semantic-search-datasets/blob/main/semantic_search.ipynb)")
    st.image("pres-futureplans.png")
    st.write("## Critical Analysis")
    st.write("- Search is not yet wired to a Streamlit text input")
    st.write("- Only the first 5000 datasets are embedded")
    st.write("- Document embeddings are recomputed on every run")


main()
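

# --- Hedged sketch, not part of the original app ---
# The "Critical Analysis" slide notes that search was never hooked up to a
# Streamlit text input. One minimal way to wire show_model() into the page
# could look like the function below; it is defined but not called, and
# `demo_search_ui` and `top_k` are names introduced for this sketch only.
def demo_search_ui(top_k=5):
    query = st.text_input("Search NCEI datasets in plain English")
    if query:
        # show_model() returns (abstract, score, title) tuples sorted by
        # decreasing score; display the top few matches.
        for abstract, score, title in show_model(query)[:top_k]:
            st.write(f"**{title}** (score: {score:.3f})")
            st.write(abstract)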
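

# --- Hedged sketch, not part of the original app ---
# Another limitation noted on the "Critical Analysis" slide is that document
# embeddings are recomputed on every run. One way to avoid that, assuming a
# Streamlit version that provides st.cache_data (>= 1.18), is to move the
# encoding behind a cached helper; `embed_corpus` is a name introduced for
# this sketch only.
@st.cache_data
def embed_corpus(docs):
    # Re-runs only when `docs` changes; otherwise Streamlit returns the
    # cached embeddings instead of re-encoding the corpus.
    model = SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
    return model.encode(docs)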