Hello Community,
Synthetic data is important in training AI for diverse reasons: it improves accuracy, reduces bias, enhances generalization, and improves applicability.
Now, that being said, we still need data that resembles the real world, in sufficient quantity, to develop and evaluate machine learning pipelines. For curating, correcting, and/or expanding that data, what's better than using Labelbox!
Note: you can also evaluate responses from other LLMs and use alternative data to further your end goal.
Requirements
Here is what you will need in order to follow along with this guide:
- Ollama
- deepseek-r1 via Ollama (I used the 8B version)
- A cloud provider of your choice (I used Azure) to store the files
- An integration to work with your newly created Labelbox offline MMC files
Once you are ready, come up with the prompt topics (math_prompts) you would like the model to generate responses for. Here we set the number of responses per topic to 2, but you can go up to 10.
Synthetic data generation script
Code to generate the inference locally
import json
import os
import time
from typing import List, Dict, Any, Optional
import subprocess
from tqdm import tqdm
import re
from azure.storage.blob import BlobServiceClient, ContainerClient
import uuid
class OllamaTopicGenerator:
    """Generate synthetic LLM responses for a list of topics via a local
    Ollama model, then export them as Labelbox model-chat-evaluation JSON
    files (uploaded to Azure Blob Storage and/or saved locally).
    """

    def __init__(self,
                 model: str = 'deepseek-r1:8b',
                 topics: Optional[List[str]] = None,
                 num_responses: int = 2,
                 connect_str: Optional[str] = None,
                 container_name: Optional[str] = None):
        """
        Initialize the topic generator.

        Args:
            model: Ollama model to use
            topics: List of topics/prompts to explore
            num_responses: Number of responses to generate per topic
                (clamped to a minimum of 1)
            connect_str: Azure Storage connection string
            container_name: Azure Storage container name
        """
        self.model = model
        self.topics = topics or []
        self.num_responses = max(1, num_responses)

        # Azure Storage setup — only attempted when both configuration
        # pieces are provided; otherwise uploads are silently disabled.
        self.connect_str = connect_str
        self.container_name = container_name
        self.blob_service_client = None
        self.container_client = None
        if connect_str and container_name:
            self._initialize_azure_clients()

    def _initialize_azure_clients(self) -> None:
        """Initialize Azure Storage clients.

        On any failure both clients are reset to None so later upload
        attempts are cleanly skipped instead of crashing.
        """
        try:
            self.blob_service_client = BlobServiceClient.from_connection_string(self.connect_str)
            self.container_client = self.blob_service_client.get_container_client(self.container_name)
        except Exception as e:
            print(f"Error initializing Azure clients: {e}")
            self.blob_service_client = None
            self.container_client = None

    def generate_topic_response(self, topic: str) -> Optional[Dict[str, Any]]:
        """
        Generate multiple responses for a specific topic using Ollama.

        Args:
            topic: Topic/prompt to explore

        Returns:
            Dictionary with topic details and LLM responses, or None if
            generation fails (non-zero exit code or per-call 180s timeout).
        """
        responses = []
        inference_times = []
        try:
            for _ in range(self.num_responses):
                start_time = time.time()
                # Run the prompt through the local Ollama CLI. The list
                # form (shell=False) avoids shell-injection issues with
                # arbitrary prompt text.
                result = subprocess.run([
                    'ollama', 'run',
                    self.model,
                    topic
                ],
                    capture_output=True,
                    text=True,
                    timeout=180)
                if result.returncode != 0:
                    raise subprocess.CalledProcessError(result.returncode, result.args, result.stdout, result.stderr)
                inference_time = round(time.time() - start_time, 2)
                responses.append(result.stdout.strip())
                inference_times.append(inference_time)
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
            print(f"Error generating response for topic: {topic}")
            print(f"Error details: {str(e)}")
            return None
        return {
            'topic': topic,
            'LLM_responses': responses,
            'inference_times': inference_times,
            'model': self.model,
            'num_responses': self.num_responses
        }

    def generate_dataset(self) -> List[Dict[str, Any]]:
        """
        Generate responses for all topics.

        Returns:
            List of topic responses (failed topics are dropped).
        """
        dataset = []
        for topic in tqdm(self.topics, desc=f"Exploring Topics ({self.num_responses} responses each)"):
            if response := self.generate_topic_response(topic):
                dataset.append(response)
        return dataset

    def upload_to_azure(self, data: str, blob_name: str) -> bool:
        """
        Upload data to Azure Blob Storage.

        Args:
            data: Data to upload
            blob_name: Name of the blob

        Returns:
            True if successful, False otherwise
        """
        if not (self.blob_service_client and self.container_client):
            print("Azure Storage clients not properly initialized")
            return False
        try:
            data_bytes = data.encode('utf-8')
            blob_client = self.container_client.get_blob_client(blob_name)
            blob_client.upload_blob(data_bytes, overwrite=True)
            print(f"Uploaded to Azure: {blob_name}")
            return True
        except Exception as e:
            print(f"Error uploading to Azure: {e}")
            return False

    def generate_labelbox_json(self, dataset: List[Dict[str, Any]], save_locally: bool = False) -> None:
        """
        Convert dataset to Labelbox JSON format and save to Azure and/or locally.

        Args:
            dataset: List of generated topic responses
            save_locally: Whether to also save files locally
        """
        for data in dataset:
            # The model-chat-evaluation layout below pairs the prompt with
            # exactly two model responses. Guard against entries generated
            # with num_responses < 2, which would otherwise raise IndexError.
            if len(data['LLM_responses']) < 2:
                print(f"Skipping topic (needs at least 2 responses): {data['topic']}")
                continue

            filename = self._sanitize_filename(data['topic']) + '.json'

            # Generate unique IDs for each actor and message.
            user_id = str(uuid.uuid4())
            model1_id = str(uuid.uuid4())
            model2_id = str(uuid.uuid4())
            message_id = str(uuid.uuid4())
            response1_id = str(uuid.uuid4())
            response2_id = str(uuid.uuid4())

            labelbox_json = {
                "type": "application/vnd.labelbox.conversational.model-chat-evaluation",
                "version": 2,
                "actors": {
                    user_id: {
                        "role": "human",
                        "metadata": {"name": "user"}
                    },
                    model1_id: {
                        "role": "model",
                        "metadata": {
                            "modelConfigName": "plain-Copy",
                            "modelConfigId": str(uuid.uuid4())
                        }
                    },
                    model2_id: {
                        "role": "model",
                        "metadata": {
                            "modelConfigName": data['model'],
                            "modelConfigId": str(uuid.uuid4())
                        }
                    }
                },
                "messages": {
                    message_id: {
                        "actorId": user_id,
                        "content": [
                            {"type": "text", "content": data['topic']}
                        ],
                        "childMessageIds": [response1_id, response2_id]
                    },
                    response1_id: {
                        "actorId": model1_id,
                        "content": [
                            {"type": "text", "content": data['LLM_responses'][0]}
                        ],
                        "childMessageIds": []
                    },
                    response2_id: {
                        "actorId": model2_id,
                        "content": [
                            {"type": "text", "content": data['LLM_responses'][1]}
                        ],
                        "childMessageIds": []
                    }
                },
                "rootMessageIds": [message_id]
            }

            json_str = json.dumps(labelbox_json, indent=2, ensure_ascii=False)
            if self.blob_service_client and self.container_client:
                self.upload_to_azure(json_str, filename)
            if save_locally:
                os.makedirs('labelbox_outputs', exist_ok=True)
                output_path = os.path.join('labelbox_outputs', filename)
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(json_str)
                print(f"Saved locally: {output_path}")

    @staticmethod
    def _sanitize_filename(filename: str) -> str:
        """
        Convert topic to a valid filename.

        Args:
            filename: Original filename/topic

        Returns:
            Sanitized filename (illegal characters removed, spaces become
            underscores, trimmed to 100 chars, never empty)
        """
        filename = re.sub(r'[<>:"/\\|?*]', '', filename)
        filename = filename.replace(' ', '_')[:100].strip('_')
        return filename or 'unnamed_topic'
def main():
    """Generate synthetic math Q&A data with a local Ollama model and
    export it as Labelbox model-chat-evaluation JSON (Azure + local copy).
    """
    # Azure Storage configuration.
    # NOTE(review): the environment variable is literally named
    # 'connect_str' (lowercase) — make sure your shell exports exactly
    # that name, e.g. AZURE_STORAGE_CONNECTION_STRING is NOT read here.
    connect_str = os.getenv('connect_str')
    container_name = "synthetic-data-maths"

    # Example usage: one prompt per branch of mathematics.
    math_prompts = [
        "How are quadratic equations solved in Algebra?",
        "What are the key principles of differentiation in Calculus?",
        "How are the properties of triangles and circles explored in Geometry?",
        "In what ways does Trigonometry relate to the study of periodic phenomena?",
        "What is the significance of eigenvalues and eigenvectors in Linear Algebra?",
        "How are ordinary differential equations used to model physical systems in Differential Equations?",
        "What are the applications of Discrete Mathematics in computer science?",
        "What does Number Theory reveal about the properties of prime numbers?",
        "How are complex numbers and functions analyzed in Complex Analysis?",
        "What are the key concepts of point-set and algebraic topology?",
        "How does Probability Theory describe random events and their likelihood?",
        "What are the foundational principles of inferential and descriptive Statistics?",
        "How does Set Theory inform our understanding of the relationships between objects?",
        "In what ways can Logic be applied to mathematical reasoning?",
        "What are some real-world applications of Game Theory in economics and social sciences?",
        "How does Group Theory explore the symmetries of mathematical structures?",
        "What combinatorial techniques are used to count and analyze finite sets in Combinatorics?",
        "What is the role of limits, continuity, and differentiation in Mathematical Analysis?",
        "How are numerical methods applied to solve mathematical problems in Numerical Analysis?",
        "What are some real-world applications of Applied Mathematics in various fields?"
    ]

    try:
        generator = OllamaTopicGenerator(
            model='deepseek-r1:8b',
            topics=math_prompts,
            num_responses=2,
            connect_str=connect_str,
            container_name=container_name
        )
        topic_responses = generator.generate_dataset()
        # save_locally=True also keeps a copy under ./labelbox_outputs
        generator.generate_labelbox_json(topic_responses, save_locally=True)
    except Exception as e:
        # Top-level boundary: report the failure rather than crash with a
        # traceback mid-run.
        print(f"Error in main execution: {e}")


if __name__ == '__main__':
    main()
Send the data to Labelbox
Then we send the data from Azure to Labelbox:
Code to send your Labelbox-formatted inference file(s) from Azure to Labelbox
import labelbox as lb
import os
from azure.storage.blob import BlobServiceClient

# Labelbox client, authenticated via the LABELBOX env var.
API_KEY = os.environ.get('LABELBOX')
client = lb.Client(api_key=API_KEY)

# Azure connection. Fix: the original snippet referenced `service`
# without ever defining it — build the BlobServiceClient here.
connect_str = os.environ.get('connect_str')
container_name = 'synthetic-data-maths'
service = BlobServiceClient.from_connection_string(connect_str)
container_client = service.get_container_client(container_name)

# Retrieve the Labelbox IAM (delegated-access) integration used to read
# the files from Azure. Index 0 assumes the first integration is the
# Azure one — verify in your organization settings.
organization = client.get_organization()
iam_integration = organization.get_iam_integrations()[0]
print(iam_integration.name)  # sanity check: which integration is used

dataset = client.create_dataset(name=f"Azure_{container_name}", iam_integration=iam_integration)

# One data row per blob; the blob name doubles as the global key.
blob_list = container_client.list_blobs()
uploads = []
for blob in blob_list:
    # NOTE(review): assumes service.url ends with '/' so that
    # account URL + container + '/' + blob name forms a valid blob URL —
    # confirm against your storage account's primary endpoint.
    url = f"{service.url}{container_client.container_name}/{blob.name}"
    uploads.append(dict(row_data=url, global_key=blob.name))

task = dataset.create_data_rows(uploads)
task.wait_till_done()
print(task.errors)
Configure the MMC offline in Labelbox
- Go to Labelbox
- Select Multimodal chat
- Choose the Offline chat
You can also follow our documentation to set this up as well as creating batches and the ontology you need.
Then, click on start labeling and…
Happy labeling!