Spaces:

ashishgargcse
/

ClinicalTerminologyUIUX-GR

Build error

App Files Files Community

ashishgargcse committed on Nov 3, 2022

Commit

b131a28

•

1 Parent(s): a61185c

Upload 3 files

Browse files

Files changed (3) hide show

README.txt +13 -0
app.py +327 -0
requirements.txt +7 -0

README.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+---
+title: 🧬CTMap - Clinical Terminology AutoMap AI
+emoji: ⚗️🧠🔬🧬
+colorFrom: yellow
+colorTo: green
+sdk: gradio
+sdk_version: 3.5
+app_file: app.py
+pinned: false
+license: mit
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,327 @@

+import pandas_profiling as pp
+import pandas as pd
+import tensorflow as tf
+from datasets import load_dataset
+from tensorflow.python.framework import tensor_shape
+# LOINC code set. load_dataset() runs at import time; the "train" split is requested.
+datasetLOINC = load_dataset("awacke1/LOINC-CodeSet-Value-Description.csv", split="train")
+# SNOMED code set, loaded the same way.
+datasetSNOMED = load_dataset("awacke1/SNOMED-CT-Code-Value-Semantic-Set.csv", split="train")
+# eCQM code set, loaded the same way.
+dataseteCQM = load_dataset("awacke1/eCQM-Code-Value-Semantic-Set.csv", split="train")
+# Tokenize the LOINC "Description" column in batches with the bert-base-cased tokenizer.
+# NOTE: this binds the module-level name `tokenizer`, which get_embeddings() reads.
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+dataset = datasetLOINC.map(lambda examples: tokenizer(examples["Description"]), batched=True)
+# Print the first tokenized row.
+JSONOBJ2=dataset[0]
+print(JSONOBJ2)
+# Rows whose Description starts with "Allergy".
+sw = datasetLOINC.filter(lambda example: example["Description"].startswith("Allergy"))
+# The result of len() is discarded here; only the print() calls below produce output.
+len(sw)
+print(sw)
+# Print the summary of each loaded dataset.
+print(datasetLOINC)
+print(datasetSNOMED)
+print(dataseteCQM)
+# play with some dataset tools before the show:
+#print(start_with_ar["Description"])
+#---
+#Main Stage - Begin!
+#---
+import os
+import json
+import numpy as np
+import gradio as gr
+# Token read from the HF_TOKEN environment variable; None when the variable is unset.
+HF_TOKEN = os.environ.get("HF_TOKEN")
+# Choices offered by the checkbox group, radio and dropdown inputs of the interface.
+CHOICES = ["SNOMED", "LOINC", "CQM"]
+# Sample JSON document; only referenced from commented-out code in this file.
+JSONOBJ = """{"items":{"item":[{"id": "0001","type": null,"is_good": false,"ppu": 0.55,"batters":{"batter":[{ "id": "1001", "type": "Regular" },{ "id": "1002", "type": "Chocolate" },{ "id": "1003", "type": "Blueberry" },{ "id": "1004", "type": "Devil's Food" }]},"topping":[{ "id": "5001", "type": "None" },{ "id": "5002", "type": "Glazed" },{ "id": "5005", "type": "Sugar" },{ "id": "5007", "type": "Powdered Sugar" },{ "id": "5006", "type": "Chocolate with Sprinkles" },{ "id": "5003", "type": "Chocolate" },{ "id": "5004", "type": "Maple" }]}]}}"""
+def profile_dataset(dataset=datasetSNOMED, username="awacke1", token=HF_TOKEN, dataset_name="awacke1/SNOMED-CT-Code-Value-Semantic-Set.csv"):
+    df = pd.read_csv(dataset.Description)
+    if len(df.columns) <= 15:
+        profile = pp.ProfileReport(df, title=f"{dataset_name} Report")
+    else:
+        profile = pp.ProfileReport(df, title=f"{dataset_name} Report", minimal = True)
+    repo_url = create_repo(f"{username}/{dataset_name}", repo_type = "space", token = token, space_sdk = "static", private=False)
+    profile.to_file("./index.html")
+    upload_file(path_or_fileobj ="./index.html", path_in_repo = "index.html", repo_id =f"{username}/{dataset_name}", repo_type = "space", token=token)
+    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
+    with open("README.md", "w+") as f:
+        f.write(readme)
+    upload_file(path_or_fileobj ="./README.md", path_in_repo = "README.md", repo_id =f"{username}/{dataset_name}", repo_type = "space", token=token)
+    return f"Your dataset report will be ready at {repo_url}"
+#def lowercase_title(example):
+#    return {"Description": example[title].lower()}
+# demonstrate map function of dataset
+#JSONOBJ_MAP=datasetLOINC.map(lowercase_title)
+#JSONOBJ_MAP=datasetLOINC.filter(lambda example: example["Description"].startswith("Mental health"))
+def concatenate_text(examples):
+    return {
+        "text": examples["Code"]
+        + " \n "
+        + examples["Description"]
+        + " \n "
+        + examples["Purpose: Clinical Focus"]
+    }
+def cls_pooling(model_output):
+    return model_output.last_hidden_state[:, 0]
+def get_embeddings(text_list):
+    encoded_input = tokenizer(
+        text_list, padding=True, truncation=True, return_tensors="tf"
+    )
+    encoded_input = {k: v for k, v in encoded_input.items()}
+    model_output = model(**encoded_input)
+    return cls_pooling(model_output)
+def fn(    text1,    text2,    num,    slider1,    slider2,    single_checkbox,    checkboxes,    radio,    dropdown,    im1,    im2,    im3,    im4,
+    video,    audio1,    audio2,    file,    df1,    df2,):
+#def fn(    text1,    text2,    single_checkbox,    checkboxes,    radio,    im4,  file,    df1,    df2,):
+    searchTerm = text1
+    searchTermSentence = text2
+    start_with_searchTermLOINC = datasetLOINC.filter(lambda example:example["Description"].startswith('Allergy'))    #Allergy
+    # FAISS
+    columns = start_with_searchTermLOINC.column_names
+    columns_to_keep = ["Value Set Name", "Code", "Description", "Purpose: Clinical Focus", "Code System OID"]
+    columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
+    start_with_searchTermLOINC = start_with_searchTermLOINC.remove_columns(columns_to_remove)
+    start_with_searchTermLOINC
+    start_with_searchTermLOINC.set_format("pandas")
+    df = start_with_searchTermLOINC[:]
+    df["Purpose: Clinical Focus"][0]
+    df4 = df.explode("Purpose: Clinical Focus", ignore_index=True)
+    df4.head(4)
+    from datasets import Dataset
+    clinical_dataset = Dataset.from_pandas(df4)
+    clinical_dataset
+    clinical_dataset = clinical_dataset.map(lambda x: {"c_length": len(x["Description"].split())})
+    clinical_dataset = clinical_dataset.filter(lambda x: x["c_length"] > 15)
+    clinical_dataset
+    clinical_dataset = clinical_dataset.map(concatenate_text)
+    #embedding = get_embeddings(clinical_dataset["text"][0])
+    #embedding.shape
+    from transformers import AutoTokenizer, TFAutoModel
+    model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
+    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+    model = TFAutoModel.from_pretrained(model_ckpt, from_pt=True)
+#    TensorShape([1, 768])
+    tf.shape([1, 768])
+    embeddings_dataset = clinical_dataset.map(
+    lambda x: {"embeddings": get_embeddings(x["text"]).numpy()[0]})
+#    embeddings_dataset.add_faiss_index(column="embeddings")
+#    question = "How can I load a dataset offline?"
+#    question_embedding = get_embeddings([question]).numpy()
+#    question_embedding.shape
+#    scores, samples = embeddings_dataset.get_nearest_examples("embeddings", question_embedding, k=5)
+#    import pandas as pd
+#    samples_df = pd.DataFrame.from_dict(samples)
+#    samples_df["scores"] = scores
+#    samples_df.sort_values("scores", ascending=False, inplace=True)
+    #        "text": examples["Code"]
+    #    + " \n "
+    #    + examples["Description"]
+    #    + " \n "
+    #    + examples["Purpose: Clinical Focus"]
+#    for _, row in samples_df.iterrows():
+#        print(f"Code: {row.Code}")
+#        print(f"Description: {row.Description}")
+#        #print(f"Purpose: Clinical Focus: {row.Purpose: Clinical Focus}")
+#        #print(f"URL: {row.html_url}")
+#        print("=" * 50)
+#        print()
+    # SNOMED and CQM ---------------
+    start_with_searchTermSNOMED = datasetSNOMED.filter(lambda example: example["Description"].startswith('Hospital'))    #Hospital
+    start_with_searchTermCQM = dataseteCQM.filter(lambda example: example["Description"].startswith('Telephone'))    #Telephone
+    print(start_with_searchTermLOINC )
+    print(start_with_searchTermSNOMED )
+    print(start_with_searchTermCQM)
+    #print(start_with_searchTermLOINC["train"][0] )
+    #print(start_with_searchTermSNOMED["train"][0] )
+    #print(start_with_searchTermCQM["train"][0] )
+    #returnMsg=profile_dataset()
+    #print(returnMsg)
+#    try:
+        #top1matchLOINC = json.loads(start_with_searchTermLOINC['train'])
+        #top1matchSNOMED = json.loads(start_with_searchTermSNOMED['train'])
+        #top1matchCQM = json.loads(start_with_searchTermCQM['train'])
+#        top1matchLOINC = json.loads(start_with_searchTermLOINC)
+#        top1matchSNOMED = json.loads(start_with_searchTermSNOMED)
+#        top1matchCQM = json.loads(start_with_searchTermCQM)
+#    except:
+#        print('Hello')
+        #print(start_with_searchTermLOINC[0])
+        #print(start_with_searchTermSNOMED[0] )
+        #print(start_with_searchTermCQM[0] )
+    #print(returnMsg)
+ #   print("Datasets Processed")
+    return (
+        (text1 if single_checkbox else text2)
+        + ", selected:"
+        + ", ".join(checkboxes),  # Text
+        {
+            "positive": num / (num + slider1 + slider2),
+            "negative": slider1 / (num + slider1 + slider2),
+            "neutral": slider2 / (num + slider1 + slider2),
+        },  # Label
+        (audio1[0], np.flipud(audio1[1]))
+        if audio1 is not None  else os.path.join(os.path.dirname(__file__), "files/cantina.wav"),  # Audio
+        np.flipud(im1)
+        if im1 is not None  else os.path.join(os.path.dirname(__file__), "files/cheetah1.jpg"),  # Image
+        video
+        if video is not None  else os.path.join(os.path.dirname(__file__), "files/world.mp4"),  # Video
+        [
+            ("The", "art"),
+            ("quick brown", "adj"),
+            ("fox", "nn"),
+            ("jumped", "vrb"),
+            ("testing testing testing", None),
+            ("over", "prp"),
+            ("the", "art"),
+            ("testing", None),
+            ("lazy", "adj"),
+            ("dogs", "nn"),
+            (".", "punc"),
+        ]   + [(f"test {x}", f"test {x}") for x in range(10)],  # HighlightedText
+        [
+            ("The testing testing testing", None),
+            ("over", 0.6),
+            ("the", 0.2),
+            ("testing", None),
+            ("lazy", -0.1),
+            ("dogs", 0.4),
+            (".", 0),
+        ]   + [(f"test", x / 10) for x in range(-10, 10)],  # HighlightedText
+        #json.loads(JSONOBJ),  # JSON
+        start_with_searchTermLOINC.to_json(orient="records", path_or_buf="None"),
+        #json.dumps(json.loads(start_with_searchTermLOINC['train'].to_json(orient="records", path_or_buf="None"))),
+        "<button style='background-color: red'>Click Me: " + radio + "</button>",  # HTML
+        os.path.join(os.path.dirname(__file__), "files/titanic.csv"),
+        df1,  # Dataframe
+        np.random.randint(0, 10, (4, 4)),  # Dataframe
+        df2,  # Timeseries
+    )
+demo = gr.Interface(
+    fn,
+    inputs=[
+        gr.Textbox(value="Allergy", label="Textbox"),
+        gr.Textbox(lines=3, value="Bathing", placeholder="Type here..", label="Textbox 2"),
+        gr.Number(label="Number", value=42),
+        gr.Slider(10, 20, value=15, label="Slider: 10 - 20"),
+        gr.Slider(maximum=20, step=0.04, label="Slider: step @ 0.04"),
+        gr.Checkbox(label="Check for NER Match on Submit"),
+        gr.CheckboxGroup(label="Clinical Terminology to Check", choices=CHOICES, value=CHOICES[0:2]),
+        gr.Radio(label="Preferred Terminology Output", choices=CHOICES, value=CHOICES[2]),
+        gr.Dropdown(label="Dropdown", choices=CHOICES),
+        gr.Image(label="Image"),
+        gr.Image(label="Image w/ Cropper", tool="select"),
+        gr.Image(label="Sketchpad", source="canvas"),
+        gr.Image(label="Webcam", source="webcam"),
+        gr.Video(label="Video"),
+        gr.Audio(label="Audio"),
+        gr.Audio(label="Microphone", source="microphone"),
+        gr.File(label="File"),
+        gr.Dataframe(label="Filters", headers=["Name", "Age", "Gender"]),
+        gr.Timeseries(x="time", y=["price", "value"], colors=["pink", "purple"]),
+    ],
+    outputs=[
+        gr.Textbox(label="Textbox"),
+        gr.Label(label="Label"),
+        gr.Audio(label="Audio"),
+        gr.Image(label="Image"),
+        gr.Video(label="Video"),
+        gr.HighlightedText(label="HighlightedText", color_map={"punc": "pink", "test 0": "blue"}),
+        gr.HighlightedText(label="HighlightedText", show_legend=True),
+        gr.JSON(label="JSON"),
+        gr.HTML(label="HTML"),
+        gr.File(label="File"),
+        gr.Dataframe(label="Dataframe"),
+        gr.Dataframe(label="Numpy"),
+        gr.Timeseries(x="time", y=["price", "value"], label="Timeseries"),
+    ],
+    examples=[
+        [
+            "Allergy",
+            "Admission",
+            10,
+            12,
+            4,
+            True,
+            ["SNOMED", "LOINC", "CQM"],
+            "SNOMED",
+            "bar",
+            os.path.join(os.path.dirname(__file__), "files/cheetah1.jpg"),
+            os.path.join(os.path.dirname(__file__), "files/cheetah1.jpg"),
+            os.path.join(os.path.dirname(__file__), "files/cheetah1.jpg"),
+            os.path.join(os.path.dirname(__file__), "files/cheetah1.jpg"),
+            os.path.join(os.path.dirname(__file__), "files/world.mp4"),
+            os.path.join(os.path.dirname(__file__), "files/cantina.wav"),
+            os.path.join(os.path.dirname(__file__), "files/cantina.wav"),
+            os.path.join(os.path.dirname(__file__), "files/titanic.csv"),
+            [[1, 2, 3], [3, 4, 5]],
+            os.path.join(os.path.dirname(__file__), "files/time.csv"),
+        ]
+    ]
+    * 3,
+    theme="default",
+    title="⚗️🧠🔬🧬 Clinical Terminology Auto Mapper AI 👩‍⚕️🩺⚕️🙋",
+    cache_examples=False,
+    description="Clinical Terminology Auto Mapper AI",
+    article="Learn more at [Yggdrasil](https://github.com/AaronCWacker/Yggdrasil)",
+#    live=True,
+)
+if __name__ == "__main__":
+    demo.launch(debug=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+datasets
+transformers
+pandas-profiling
+huggingface-hub
+gradio
+Tensorflow
+torch