Radiological image datasets are invaluable for medical research, but their utility is often limited by the lack of descriptive metadata that enables intuitive querying and exploration. Transforming such datasets into multimodal collections by integrating image descriptions, derived from open-source large language models (LLMs), bridges this gap. This approach empowers researchers and practitioners to search for and analyze content using natural language, making the data more accessible, searchable, and valuable for downstream tasks such as diagnosis, education, and cross-disciplinary collaboration.
Let’s go
!pip install --upgrade pip
!pip install --upgrade transformers accelerate
!pip install sentencepiece bitsandbytes pillow scipy deeplake mlcroissant
Restart kernel
# Make sure to restart Colab runtime after installing dependencies
import os

try:
    import google.colab  # Only defined inside a Colab runtime
    os._exit(0)  # Hard-exit the process so Colab restarts it with the freshly installed packages
except ImportError:
    pass  # Not running in Colab; no restart needed
Load libraries
from datetime import datetime
import multiprocessing
import os

import deeplake
import mlcroissant as mlc
import numpy as np
import pandas as pd
import torch
from google.colab import userdata
from PIL import Image
from tqdm.notebook import tqdm
from transformers import AutoProcessor, BitsAndBytesConfig, LlavaForConditionalGeneration, pipeline
Create a Croissant file for our example
import json
croissant_details = {
    "@context": {
        "@language": "en",
        "@vocab": "https://schema.org/",
        "citeAs": "cr:citeAs",
        "column": "cr:column",
        "conformsTo": "dct:conformsTo",
        "cr": "http://mlcommons.org/croissant/",
        "data": {
            "@id": "cr:data",
            "@type": "@json"
        },
        "dataBiases": "cr:dataBiases",
        "dataCollection": "cr:dataCollection",
        "dataType": {
            "@id": "cr:dataType",
            "@type": "@vocab"
        },
        "dct": "http://purl.org/dc/terms/",
        "extract": "cr:extract",
        "field": "cr:field",
        "fileProperty": "cr:fileProperty",
        "fileObject": "cr:fileObject",
        "fileSet": "cr:fileSet",
        "format": "cr:format",
        "includes": "cr:includes",
        "isEnumeration": "cr:isEnumeration",
        "jsonPath": "cr:jsonPath",
        "key": "cr:key",
        "md5": "cr:md5",
        "parentField": "cr:parentField",
        "path": "cr:path",
        "personalSensitiveInformation": "cr:personalSensitiveInformation",
        "recordSet": "cr:recordSet",
        "references": "cr:references",
        "regex": "cr:regex",
        "repeated": "cr:repeated",
        "replace": "cr:replace",
        "sc": "https://schema.org/",
        "separator": "cr:separator",
        "source": "cr:source",
        "subField": "cr:subField",
        "transform": "cr:transform",
        "wd": "https://www.wikidata.org/wiki/"
    },
    "alternateName": "",
    "conformsTo": "http://mlcommons.org/croissant/1.0",
    "license": {
        "@type": "sc:CreativeWork",
        "name": "Database: Open Database, Contents: Database Contents",
        "url": "http://opendatacommons.org/licenses/dbcl/1.0/"
    },
    "distribution": [
        {
            "contentUrl": "https://www.kaggle.com/api/v1/datasets/download/diayruldip/carinocroma?datasetVersionNumber=1",
            "contentSize": "118.616 MB",
            "md5": "PwaKTf5UOILk7VreP/ZGNQ==",
            "encodingFormat": "application/zip",
            "@id": "archive.zip",
            "@type": "cr:FileObject",
            "name": "archive.zip",
            "description": "Archive containing all the contents of the Chest CT Scan Image Lung dataset"
        },
        {
            "includes": "*.txt",
            "containedIn": {
                "@id": "archive.zip"
            },
            "encodingFormat": "text/txt",
            "@id": "image_labels",
            "@type": "cr:FileSet",
            "name": "image_labels"
        },
        {
            "includes": "*.**g",
            "containedIn": {
                "@id": "archive.zip"
            },
            "encodingFormat": "image/jpeg",
            "@id": "image-files",
            "@type": "cr:FileSet",
            "name": "image/jpeg files",
            "description": "image/jpeg files contained in archive.zip"
        }
    ],
    "recordSet": [
        {
            "@type": "cr:RecordSet",
            "@id": "images",
            "name": "images",
            "key": {
                "@id": "img_id"
            },
            "field": [
                {
                    "@type": "cr:Field",
                    "@id": "images/image_filename",
                    "name": "images/image_filename",
                    "description": "The filename of the image. eg: COCO_train2014_000000000003.jpg",
                    "dataType": "sc:Text",
                    "source": {
                        "fileSet": {
                            "@id": "image-files"
                        },
                        "extract": {
                            "fileProperty": "filename"
                        }
                    }
                },
                {
                    "@type": "cr:Field",
                    "@id": "images/image_content",
                    "name": "images/image_content",
                    "description": "The content of the image.",
                    "dataType": "sc:ImageObject",
                    "source": {
                        "fileSet": {
                            "@id": "image-files"
                        },
                        "extract": {
                            "fileProperty": "content"
                        }
                    }
                },
                {
                    "@type": "cr:Field",
                    "@id": "images/split",
                    "name": "images/split",
                    "dataType": [
                        "sc:Text"
                    ],
                    "source": {
                        "fileSet": {
                            "@id": "image-files"
                        },
                        "extract": {
                            "fileProperty": "fullpath"
                        },
                        "transform": {
                            "regex": "^.*/(train|valid|test).*$"
                        }
                    }
                },
                {
                    "@type": "cr:Field",
                    "@id": "images/label",
                    "name": "images/label",
                    "dataType": [
                        "sc:Text"
                    ],
                    "source": {
                        "fileSet": {
                            "@id": "image-files"
                        },
                        "extract": {
                            "fileProperty": "fullpath"
                        },
                        "transform": {
                            "regex": "^.*/(.*)/.*..*$"
                        }
                    }
                }
            ]
        }
    ],
    "version": 1,
    "keywords": [
        "subject > health and fitness > health > health conditions > cancer"
    ],
    "isAccessibleForFree": "true",
    "includedInDataCatalog": {
        "@type": "sc:DataCatalog",
        "name": "Kaggle",
        "url": "https://www.kaggle.com"
    },
    "creator": {
        "@type": "sc:Person",
        "name": "Diayrul Dip",
        "url": "/diayruldip",
        "image": "https://storage.googleapis.com/kaggle-avatars/thumbnails/default-thumb.png"
    },
    "publisher": {
        "@type": "sc:Organization",
        "name": "Kaggle",
        "url": "https://www.kaggle.com/organizations/kaggle",
        "image": "https://storage.googleapis.com/kaggle-organizations/4/thumbnail.png"
    },
    "thumbnailUrl": "https://storage.googleapis.com/kaggle-datasets-images/new-version-temp-images/default-backgrounds-45.png-10561992/dataset-card.png",
    "dateModified": "2022-05-17T06:09:27.707",
    "datePublished": "2022-05-17T06:09:27.707",
    "@type": "sc:Dataset",
    "name": "Chest CT Scan Image Lung",
    "url": "https://www.kaggle.com/datasets/diayruldip/carinocroma/versions/1",
    "description": "\n**Data**\nImages are not in dcm format, the images are in jpg or png to fit the model\nData contain 3 chest cancer types which are Adenocarcinoma,Large cell carcinoma, Squamous cell carcinoma , and 1 folder for the normal cell\nData folder is the main folder that contain all the step folders\ninside Data folder are test , train , valid\n\ntest represent testing set\ntrain represent training set\nvalid represent validation set\ntraining set is 70%\ntesting set is 20%\nvalidation set is 10%\n\n**Adenocarcinoma**\nAdenocarcinoma of the lung: Lung adenocarcinoma is the most common form of lung cancer\naccounting for 30 percent of all cases overall and about 40 percent\nof all non-small cell lung cancer occurrences. Adenocarcinomas are\nfound in several common cancers, including breast, prostate and colorectal.\nAdenocarcinomas of the lung are found in the outer region of the lung\nin glands that secrete mucus and help us breathe.\nSymptoms include coughing, hoarseness, weight loss and weakness.\n\n**Large cell carcinoma**\nLarge-cell undifferentiated carcinoma: Large-cell undifferentiated carcinoma lung cancer grows and spreads quickly and can\nbe found anywhere in the lung. This type of lung cancer usually accounts for 10\nto 15 percent of all cases of NSCLC.\nLarge-cell undifferentiated carcinoma tends to grow and spread quickly.\n**\nSquamous cell carcinoma**\nSquamous cell: This type of lung cancer is found centrally in the lung,\nwhere the larger bronchi join the trachea to the lung,\nor in one of the main airway branches.\nSquamous cell lung cancer is responsible for about 30 percent of all non-small\ncell lung cancers, and is generally linked to smoking.\n\nAnd the last folder is the normal CT-Scan images\n\n**Acknowledgements**\nWe wouldn't be here without the help of others and the resources we found.\nthanks for all of my team and the people who supported us\n\nInspiration\nI want to hear all your feedback"
}
with open("chest_ct_kaggle.json", "w") as f:
json.dump(croissant_details, f, indent=4)
print("chest_ct_kaggle.json created successfully!")
dataset_name = 'chest_ct'
org_id = "<your_org>"  # CHANGE THIS ACCORDING TO YOUR ORG ON https://app.activeloop.ai/
path_to_deeplake_db = f'al://{org_id}/{dataset_name}'
path_to_croissant_file = '/content/chest_ct_kaggle.json'
dataset = mlc.Dataset(jsonld=path_to_croissant_file)
metadata = dataset.metadata.to_json()
Using code similar to Chapter 1, download the data defined in the Croissant file and save it to a Deep Lake object
records_loaded = dataset.records(record_set="images")
print("number of images in the dataset: {}".format(len(list(records_loaded))))
for i, record in enumerate(records_loaded):
    print("index {} is file \"{}\" with label {} in split {}".format(
        i,
        record['images/image_filename'].decode("utf-8"),
        record['images/label'].decode("utf-8"),
        record['images/split'].decode("utf-8"),
    ))
    if i > 10:
        break
Get an API token from https://app.activeloop.ai/ and add it as a secret in the Colab secret manager. Name it ACTIVELOOP_TOKEN.
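As a quick sanity check, using only the google.colab.userdata API already imported above, confirm the secret is visible to this runtime before creating the dataset:
# userdata.get raises an error if the secret does not exist or if
# notebook access has not been granted in the Colab secrets panel.
token = userdata.get('ACTIVELOOP_TOKEN')
print("Activeloop token found ({} characters)".format(len(token)))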
The code below takes the Croissant 🥐 file, meant for comprehensive data sharing, and turns it into a general-purpose Deep Lake object. The best of both worlds 💪!
start_time = datetime.now()
# Start from a clean slate: remove any existing dataset at this path
try:
    deeplake.delete(path_to_deeplake_db, token=userdata.get('ACTIVELOOP_TOKEN'))
except Exception as e:
    print(f"Could not delete dataset {path_to_deeplake_db}: {e}")
ds = deeplake.create(path_to_deeplake_db, token=userdata.get('ACTIVELOOP_TOKEN'))
num_cpu = multiprocessing.cpu_count()
print("Processing with {} CPUs".format(num_cpu))
for key in metadata:
    if key == 'recordSet':
        continue
    print(f"Adding Croissant metadata to deeplake DB: {key}")
    croissant_obj = metadata[key]
    if isinstance(croissant_obj, datetime):
        # Metadata values must be serializable, so stringify datetimes
        croissant_obj = croissant_obj.strftime("%Y-%m-%d %H:%M:%S.%f")
    ds.metadata[key] = croissant_obj
record_sets = ", ".join([f"`{rs.id}`" for rs in dataset.metadata.record_sets])
record_sets = [f"{rs.id}" for rs in dataset.metadata.record_sets]
for i in record_sets:
ds.add_column("record_set", "text")
ds.add_column("filename", "text")
ds.add_column("split", "text")
ds.add_column("label", "text")
ds.add_column("description", "text")
ds.add_column("embedding", dtype=deeplake.types.Embedding(1024))
ds.add_column("image", deeplake.types.Image(sample_compression="png"))
records_loaded = dataset.records(record_set=i)
print("number of images in the dataset: {}".format(len(list(records_loaded))))
for j,record in tqdm(enumerate(records_loaded), total=len(list(records_loaded))):
arr = np.asarray(record['images/image_content'])
if len(arr.shape) == 2: continue
ds.append([{
"record_set": i,
"filename": record['images/image_filename'].decode("utf-8"),
"split": record['images/split'].decode("utf-8"),
"label": record['images/label'].decode("utf-8"),
"description": "to be added",
"embedding": np.zeros(1024),
"image": np.asarray(record['images/image_content'])
}])
if j > 20: # Comment out if you want to process the entire dataset
break
stop_time = datetime.now()
execution_time = stop_time - start_time
print(f"Execution time: {execution_time}")
ds.summary()
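As an optional check, read one row back with the same row-access pattern used in the next section, to confirm the ingest worked end to end:
# Read a single sample back from the Deep Lake dataset.
row = ds[0]
print("filename:", row["filename"])
print("label:", row["label"])
print("split:", row["split"])
print("image shape:", np.asarray(row["image"]).shape)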
Let's extract an image for some downstream task (in this case, LLaVA image description) and save the description back to the Deep Lake object. By this logic, one can stepwise enrich limited datasets with context from other sources.
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16
)
model_id = "llava-hf/llava-onevision-qwen2-0.5b-si-hf"
pipe = pipeline("image-text-to-text",
model=model_id,
model_kwargs={"quantization_config": quantization_config})
for image_id in tqdm(range(len(ds)), total=len(ds)):
    imgInput = Image.fromarray(ds[image_id]["image"]).convert("RGB")
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Describe this image. Use complicated medical jargon."},
            ],
        },
    ]
    outputs = pipe(
        images=imgInput,
        text=messages,
        generate_kwargs={"max_new_tokens": 500}
    )
    # Access the 'generated_text' key and save it back to the Deep Lake DB
    if outputs and outputs[0] and 'generated_text' in outputs[0]:
        # 'generated_text' holds the full chat; the last turn is the assistant's reply
        ds[image_id]["description"] = str(outputs[0]['generated_text'][-1]["content"])
    else:
        print("Could not find 'generated_text' in the output.")
Next, the descriptions are turned into embeddings. This is needed to later search for similarity to an input query; similarity is measured in the embedding space of the text. The MTEB leaderboard gives an overview of the relevant, best-performing models. See link.
from transformers import AutoModel, AutoTokenizer
import torch

# Load the pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')
model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B')

def embedding_function(texts):
    if isinstance(texts, str):
        texts = [texts]
    # Tokenize the texts
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    # Compute embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Mean pooling - take the attention mask into account for correct averaging
    def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return sentence_embeddings.tolist()
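To see what "similarity in the embedding space of text" means concretely, here is a small sketch comparing cosine similarities of related and unrelated sentences (the example sentences are made up; exact scores depend on the model):
# Cosine similarity between two embedding vectors.
def cosine_sim(a, b):
    a, b = np.asarray(a), np.asarray(b)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

emb = embedding_function([
    "Axial CT slice at the level of the carina showing the trachea.",
    "CT image of the trachea and the main bronchi.",
    "A photograph of a sunny beach.",
])
print("related sentences:  ", cosine_sim(emb[0], emb[1]))
print("unrelated sentences:", cosine_sim(emb[0], emb[2]))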
for i in range(len(ds)):
    description = ds[i]["description"]
    if description:  # Check that the description is not empty
        embeddings = embedding_function(description)
        ds[i]["embedding"] = embeddings[0]  # Store a single embedding per image
    else:
        # Where the description is empty, fall back to a zero vector
        ds[i]["embedding"] = np.zeros(model.config.hidden_size).tolist()  # hidden_size is the embedding dimension (1024 here)
print("Text embeddings created and saved to the 'embedding' column.")
And now let’s explore our dataset of chest CT scans using text queries. We have created our own multimodal dataset, and thanks to the advanced retrieval techniques that Deep Lake serves out of the box, we can use natural language to explore our image collection. A more general description of the method is here: link.
query = "trachea"
embed_query = embedding_function(query)[0]
str_query = ",".join(str(c) for c in embed_query)
query_vs = f"""
SELECT *, cosine_similarity(embedding, ARRAY[{str_query}]) as score
FROM (
SELECT *, ROW_NUMBER() AS row_id
)
ORDER BY cosine_similarity(embedding, ARRAY[{str_query}]) DESC
LIMIT 3
"""
view_vs = ds.query(query_vs)
for row in view_vs:
    print(f"filename: {row['filename']} \ndescription: {row['description']} \nwith score: {row['score']}")
    print(10 * '#')
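As an optional sanity check, independent of the TQL engine, you can recompute the top hit's cosine score in plain numpy and compare it to what the query returned:
# Recompute the cosine similarity of the top result outside of TQL.
top = view_vs[0]
doc_emb = np.asarray(top["embedding"], dtype=np.float32)
q_emb = np.asarray(embed_query, dtype=np.float32)
manual_score = float(np.dot(doc_emb, q_emb) / (np.linalg.norm(doc_emb) * np.linalg.norm(q_emb)))
print(f"TQL score: {top['score']:.4f}, manual score: {manual_score:.4f}")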