Spaces:

awacke1
/

SNOMED-LOINC-eCQM

Paused

App Files Files Community

SNOMED-LOINC-eCQM / app.py

awacke1

Update app.py

6c0ceec verified 4 months ago

raw history blame contribute delete

No virus

11.4 kB

	# import pandas_profiling as pp
	import pandas as pd
	import tensorflow as tf

	from datasets import load_dataset
	from tensorflow.python.framework import tensor_shape

	# --- Clinical code sets: the "train" split of each terminology table ---
	datasetLOINC = load_dataset("awacke1/LOINC-CodeSet-Value-Description.csv", split="train")  # LOINC
	datasetSNOMED = load_dataset("awacke1/SNOMED-CT-Code-Value-Semantic-Set.csv", split="train")  # SNOMED
	dataseteCQM = load_dataset("awacke1/eCQM-Code-Value-Semantic-Set.csv", split="train")  # eCQM

	# Tokenize every LOINC "Description" in batches and show the first tokenized row.
	from transformers import AutoTokenizer
	tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
	dataset = datasetLOINC.map(lambda batch: tokenizer(batch["Description"]), batched=True)
	JSONOBJ2 = dataset[0]
	print(JSONOBJ2)

	# LOINC rows whose description begins with "Allergy".
	sw = datasetLOINC.filter(lambda row: row["Description"].startswith("Allergy"))
	len(sw)  # result is discarded
	# One line per dataset summary, in the same order as before.
	print(sw, datasetLOINC, datasetSNOMED, dataseteCQM, sep="\n")

	# play with some dataset tools before the show:

	#print(start_with_ar["Description"])

	#---
	#Main Stage - Begin!
	#---

	import os
	import json
	import numpy as np
	import gradio as gr

	# Hugging Face access token taken from the environment (None when unset).
	# It is not passed to any call in the visible part of this file.
	HF_TOKEN = os.environ.get("HF_TOKEN")
	# Terminology names offered by the CheckboxGroup, Radio and Dropdown inputs of the demo.
	CHOICES = ["SNOMED", "LOINC", "CQM"]
	# Sample JSON document kept as a string; no live code in this file reads it.
	JSONOBJ = """{"items":{"item":[{"id": "0001","type": null,"is_good": false,"ppu": 0.55,"batters":{"batter":[{ "id": "1001", "type": "Regular" },{ "id": "1002", "type": "Chocolate" },{ "id": "1003", "type": "Blueberry" },{ "id": "1004", "type": "Devil's Food" }]},"topping":[{ "id": "5001", "type": "None" },{ "id": "5002", "type": "Glazed" },{ "id": "5005", "type": "Sugar" },{ "id": "5007", "type": "Powdered Sugar" },{ "id": "5006", "type": "Chocolate with Sprinkles" },{ "id": "5003", "type": "Chocolate" },{ "id": "5004", "type": "Maple" }]}]}}"""



	def concatenate_text(examples):
	    """Build the ``text`` field for one record.

	    The record's "Code", "Description" and "Purpose: Clinical Focus" values
	    are glued together, in that order, with ``" \\n "`` between them.

	    Returns:
	        A one-key dict ``{"text": <combined string>}``.
	    """
	    field_order = ("Code", "Description", "Purpose: Clinical Focus")
	    pieces = [examples[field] for field in field_order]
	    return {"text": " \n ".join(pieces)}

	def cls_pooling(model_output):
	    """Return the hidden state at sequence position 0 for every item in the batch."""
	    return model_output.last_hidden_state[:, 0]


	def get_embeddings(text_list, text_tokenizer=None, text_model=None):
	    """Embed *text_list* and return the position-0 (CLS) pooled model output.

	    Args:
	        text_list: Text (or list of texts) handed to the tokenizer.
	        text_tokenizer: Tokenizer callable. When omitted, the module-level
	            ``tokenizer`` is used.
	        text_model: Model callable. When omitted, a module-level ``model`` is
	            looked up; callers should pass the model explicitly because this
	            file does not define ``model`` at module scope.

	    Returns:
	        Whatever ``cls_pooling`` returns for the model output.
	    """
	    if text_tokenizer is None:
	        text_tokenizer = tokenizer
	    if text_model is None:
	        text_model = model
	    encoded_input = text_tokenizer(
	        text_list, padding=True, truncation=True, return_tensors="tf"
	    )
	    model_output = text_model(**dict(encoded_input.items()))
	    return cls_pooling(model_output)


	# Cache of (tokenizer, model) pairs keyed by checkpoint name, so the
	# checkpoint is not reloaded on every request.
	_EMBEDDING_BACKENDS = {}


	def _load_embedding_backend(model_ckpt):
	    """Return the ``(tokenizer, model)`` pair for *model_ckpt*, loading it on first use."""
	    if model_ckpt not in _EMBEDDING_BACKENDS:
	        # Imported lazily, as the original code did inside ``fn``.
	        from transformers import AutoTokenizer, TFAutoModel

	        _EMBEDDING_BACKENDS[model_ckpt] = (
	            AutoTokenizer.from_pretrained(model_ckpt),
	            TFAutoModel.from_pretrained(model_ckpt, from_pt=True),
	        )
	    return _EMBEDDING_BACKENDS[model_ckpt]


	def fn( text1, text2, num, slider1, slider2, single_checkbox, checkboxes, radio, dropdown, im1, im2, im3, im4,
	       video, audio1, audio2, file, df1, df2,):
	    """Gradio callback: filter the code sets, embed LOINC matches, build the 13 outputs.

	    The arguments arrive in the order of the ``inputs`` list of the interface.
	    ``dropdown``, ``im2``, ``im3``, ``im4``, ``audio2`` and ``file`` are
	    accepted but not used.

	    Returns:
	        A 13-tuple, in order: text, label confidences, audio, image, video,
	        two highlighted-text lists, the filtered LOINC rows as JSON records,
	        an HTML snippet, a CSV path, ``df1``, a random 4x4 integer array,
	        and ``df2``.
	    """
	    # NOTE(review): text1/text2 are not used as search terms yet; the three
	    # prefixes below are hard-coded exactly as before.
	    start_with_searchTermLOINC = datasetLOINC.filter(
	        lambda example: example["Description"].startswith("Allergy")
	    )

	    # Keep only the columns of interest. Only columns that actually exist are
	    # removed; a symmetric difference would also list wanted-but-missing
	    # columns, which remove_columns cannot drop.
	    columns_to_keep = ["Value Set Name", "Code", "Description", "Purpose: Clinical Focus", "Code System OID"]
	    columns_to_remove = [
	        column for column in start_with_searchTermLOINC.column_names
	        if column not in columns_to_keep
	    ]
	    start_with_searchTermLOINC = start_with_searchTermLOINC.remove_columns(columns_to_remove)
	    start_with_searchTermLOINC.set_format("pandas")
	    df = start_with_searchTermLOINC[:]

	    # One row per "Purpose: Clinical Focus" entry, then back to a Dataset.
	    df4 = df.explode("Purpose: Clinical Focus", ignore_index=True)

	    from datasets import Dataset
	    clinical_dataset = Dataset.from_pandas(df4)
	    clinical_dataset = clinical_dataset.map(lambda x: {"c_length": len(x["Description"].split())})
	    # Keep descriptions longer than 15 whitespace-separated words.
	    clinical_dataset = clinical_dataset.filter(lambda x: x["c_length"] > 15)
	    clinical_dataset = clinical_dataset.map(concatenate_text)

	    # Embed with the tokenizer that belongs to the embedding model and pass
	    # both explicitly, so get_embeddings does not depend on module globals.
	    model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
	    embed_tokenizer, embed_model = _load_embedding_backend(model_ckpt)
	    embeddings_dataset = clinical_dataset.map(
	        lambda x: {"embeddings": get_embeddings(x["text"], embed_tokenizer, embed_model).numpy()[0]}
	    )
	    # TODO: the nearest-neighbour search is still disabled. Planned steps:
	    #   embeddings_dataset.add_faiss_index(column="embeddings")
	    #   scores, samples = embeddings_dataset.get_nearest_examples("embeddings", question_embedding, k=5)

	    # SNOMED and CQM ---------------
	    start_with_searchTermSNOMED = datasetSNOMED.filter(
	        lambda example: example["Description"].startswith("Hospital")
	    )
	    start_with_searchTermCQM = dataseteCQM.filter(
	        lambda example: example["Description"].startswith("Telephone")
	    )

	    print(start_with_searchTermLOINC )
	    print(start_with_searchTermSNOMED )
	    print(start_with_searchTermCQM)

	    # Label confidences; fall back to zeros when the three numbers sum to 0
	    # instead of dividing by zero.
	    total = num + slider1 + slider2
	    if total:
	        confidences = {
	            "positive": num / total,
	            "negative": slider1 / total,
	            "neutral": slider2 / total,
	        }
	    else:
	        confidences = {"positive": 0.0, "negative": 0.0, "neutral": 0.0}

	    # Filtered LOINC rows as a list of record dicts. Dataset.to_json treats
	    # its path argument as an output file, so the DataFrame is serialised
	    # in memory instead.
	    loinc_records = json.loads(df.to_json(orient="records"))

	    base_dir = os.path.dirname(__file__)

	    return (
	        (text1 if single_checkbox else text2)
	        + ", selected:"
	        + ", ".join(checkboxes),  # Text
	        confidences,  # Label
	        (audio1[0], np.flipud(audio1[1]))
	        if audio1 is not None else os.path.join(base_dir, "files/cantina.wav"),  # Audio
	        np.flipud(im1)
	        if im1 is not None else os.path.join(base_dir, "files/cheetah1.jpg"),  # Image
	        video
	        if video is not None else os.path.join(base_dir, "files/world.mp4"),  # Video
	        [
	            ("The", "art"),
	            ("quick brown", "adj"),
	            ("fox", "nn"),
	            ("jumped", "vrb"),
	            ("testing testing testing", None),
	            ("over", "prp"),
	            ("the", "art"),
	            ("testing", None),
	            ("lazy", "adj"),
	            ("dogs", "nn"),
	            (".", "punc"),
	        ] + [(f"test {x}", f"test {x}") for x in range(10)],  # HighlightedText
	        [
	            ("The testing testing testing", None),
	            ("over", 0.6),
	            ("the", 0.2),
	            ("testing", None),
	            ("lazy", -0.1),
	            ("dogs", 0.4),
	            (".", 0),
	        ] + [("test", x / 10) for x in range(-10, 10)],  # HighlightedText
	        loinc_records,  # JSON
	        "<button style='background-color: red'>Click Me: " + radio + "</button>",  # HTML
	        os.path.join(base_dir, "files/titanic.csv"),  # File
	        df1,  # Dataframe
	        np.random.randint(0, 10, (4, 4)),  # Dataframe
	        df2,  # Timeseries
	    )



	# Sample media shipped next to this script; used to pre-fill the example row.
	_FILES_DIR = os.path.join(os.path.dirname(__file__), "files")
	_CHEETAH_JPG = os.path.join(_FILES_DIR, "cheetah1.jpg")
	_CANTINA_WAV = os.path.join(_FILES_DIR, "cantina.wav")

	# One value per input component, in the same order as ``inputs`` below.
	_EXAMPLE_ROW = [
	    "Allergy",
	    "Admission",
	    10,
	    12,
	    4,
	    True,
	    ["SNOMED", "LOINC", "CQM"],
	    "SNOMED",
	    "LOINC",  # was "bar", which is not one of the Dropdown's CHOICES
	    _CHEETAH_JPG,
	    _CHEETAH_JPG,
	    _CHEETAH_JPG,
	    _CHEETAH_JPG,
	    os.path.join(_FILES_DIR, "world.mp4"),
	    _CANTINA_WAV,
	    _CANTINA_WAV,
	    os.path.join(_FILES_DIR, "titanic.csv"),
	    [[1, 2, 3], [3, 4, 5]],
	    os.path.join(_FILES_DIR, "time.csv"),
	]

	# The interface wires the 19 inputs to ``fn`` and shows its 13 outputs.
	demo = gr.Interface(
	    fn,
	    inputs=[
	        gr.Textbox(value="Allergy", label="Textbox"),
	        gr.Textbox(lines=3, value="Bathing", placeholder="Type here..", label="Textbox 2"),
	        gr.Number(label="Number", value=42),
	        gr.Slider(10, 20, value=15, label="Slider: 10 - 20"),
	        gr.Slider(maximum=20, step=0.04, label="Slider: step @ 0.04"),
	        gr.Checkbox(label="Check for NER Match on Submit"),
	        gr.CheckboxGroup(label="Clinical Terminology to Check", choices=CHOICES, value=CHOICES[0:2]),
	        gr.Radio(label="Preferred Terminology Output", choices=CHOICES, value=CHOICES[2]),
	        gr.Dropdown(label="Dropdown", choices=CHOICES),
	        gr.Image(label="Image"),
	        gr.Image(label="Image w/ Cropper"),
	        gr.Image(label="Sketchpad"),
	        gr.Image(label="Webcam", source="webcam"),
	        gr.Video(label="Video"),
	        gr.Audio(label="Audio"),
	        gr.Audio(label="Microphone", source="microphone"),
	        gr.File(label="File"),
	        gr.Dataframe(label="Filters", headers=["Name", "Age", "Gender"]),
	        gr.Timeseries(x="time", y=["price", "value"], colors=["pink", "purple"]),
	    ],
	    outputs=[
	        gr.Textbox(label="Textbox"),
	        gr.Label(label="Label"),
	        gr.Audio(label="Audio"),
	        gr.Image(label="Image"),
	        gr.Video(label="Video"),
	        gr.HighlightedText(label="HighlightedText", color_map={"punc": "pink", "test 0": "blue"}),
	        gr.HighlightedText(label="HighlightedText", show_legend=True),
	        gr.JSON(label="JSON"),
	        gr.HTML(label="HTML"),
	        gr.File(label="File"),
	        gr.Dataframe(label="Dataframe"),
	        gr.Dataframe(label="Numpy"),
	        gr.Timeseries(x="time", y=["price", "value"], label="Timeseries"),
	    ],
	    # The same example row is listed three times.
	    examples=[_EXAMPLE_ROW] * 3,
	    theme="default",
	    title="⚗️🧠🔬🧬 Clinical Terminology Auto Mapper AI 👩‍⚕️🩺⚕️🙋",
	    cache_examples=False,
	    description="Clinical Terminology Auto Mapper AI",
	    article="Learn more at [Yggdrasil](https://github.com/AaronCWacker/Yggdrasil)",
	    # live=True,
	)

	# Start the app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch(debug=True)