How To: Rubric evaluation for synthetic chat data (gpt-oss:20b) in Labelbox

Hello Community :waving_hand:,

In our latest iteration of the Multi-Modal Chat (MMC) editor, we’ve introduced Rubric, a powerful evaluation framework for human-in-the-loop processes.

This new feature allows labelers to create custom evaluation schemas directly within model-user conversation threads. The integrated scoring system and classifier enable more nuanced analysis, allowing evaluators to assess and comment on conversation parameters in ways that go beyond deterministic metrics.
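
Under the hood, a rubric is expressed through your project ontology. As a rough sketch of what that could look like via the Python SDK (the classification names and the 1-5 scale below are illustrative assumptions, not a prescribed schema):

import labelbox as lb

client = lb.Client(api_key='')  # your Labelbox API key

# Hypothetical rubric: a 1-5 score plus a free-text evaluator comment
ontology_builder = lb.OntologyBuilder(
    classifications=[
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="helpfulness_score",  # illustrative name
            options=[lb.Option(value=str(i)) for i in range(1, 6)]
        ),
        lb.Classification(
            class_type=lb.Classification.Type.TEXT,
            name="evaluator_comment"  # illustrative name
        )
    ]
)

ontology = client.create_ontology(
    "rubric-chat-eval",  # illustrative name
    ontology_builder.asdict(),
    media_type=lb.MediaType.Conversational
)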

Building on our previous guides for synthetic data generation, we leveraged OpenAI’s open-weight model (gpt-oss:20b) to generate synthetic conversations based on predefined prompts.

Early results from our pilot batch suggest potential for implementing a RUBICON-style approach, enabling both automated evaluation and dynamic rubric generation.

Here is a sample script to get you started. Note that it can also be used with the live version of the MMC (with real-time inference from industry-leading models) via Foundry.

Synthetic data generation script

Code to generate the synthetic chat locally
import json
import os
import time
from typing import List, Dict, Any, Optional
from azure.storage.blob import BlobServiceClient
import subprocess
from tqdm import tqdm
import re
import uuid 


class OllamaTopicGenerator:
    def __init__(self, 
                 # OpenAI's open-weight model
                 model: str = 'gpt-oss:20b', 
                 topics: Optional[List[str]] = None, 
                 num_responses: int = 2,
                 temperature: float = 0.1,
                 top_p: float = 0.9,
                 connect_str: Optional[str] = None,
                 container_name: Optional[str] = None):
        """
        Initialize the topic generator.
        
        Args:
            model: Ollama model to use
            topics: List of topics/prompts to explore
            num_responses: Number of responses to generate per topic
            temperature: Sampling temperature (stored on the instance; the ollama CLI calls below do not pass it)
            top_p: Nucleus sampling value (stored on the instance; the ollama CLI calls below do not pass it)
            connect_str: Azure Storage connection string
            container_name: Azure Storage container name
        """
        self.model = model
        self.topics = topics or []
        self.num_responses = max(1, num_responses)
        self.temperature = temperature
        self.top_p = top_p
        
        # Azure Storage setup
        self.connect_str = connect_str
        self.container_name = container_name
        self.blob_service_client = None
        self.container_client = None
        
        if connect_str and container_name:
            self._initialize_azure_clients()

    def _sanitize_text(self, text: str) -> str:
        """
        Remove common "thinking/analysis" artifacts and speaker labels.

        This targets patterns sometimes emitted by models, such as:
        - XML-like tags: <thinking>...</thinking>, <analysis>...</analysis>
        - Lines starting with "Thinking:", "Analysis:", "Reasoning:", "Thoughts:"
        - Bracketed tokens like [Thinking], [Analysis]
        - Speaker labels like "Assistant:" or "User:"
        - Surrounding quotes
        """
        cleaned = text.strip()
        # Remove explicit thinking blocks: start "Thinking..." and end "...done thinking."
        cleaned = re.sub(r"`?Thinking\.\.\.`?[\s\S]*?`?\.\.\.done thinking\.`?", "", cleaned, flags=re.IGNORECASE)
        cleaned = re.sub(r"<\s*(thinking|analysis|reasoning)\s*>[\s\S]*?<\s*/\s*\1\s*>", "", cleaned, flags=re.IGNORECASE)
        cleaned = re.sub(r"^(Thinking|Analysis|Reasoning|Thoughts)\s*:\s*.*$", "", cleaned, flags=re.IGNORECASE | re.MULTILINE)
        cleaned = re.sub(r"\[(Thinking|Analysis|Reasoning|Chain of Thought)\]", "", cleaned, flags=re.IGNORECASE)
        cleaned = re.sub(r"^(Assistant|User)\s*:\s*", "", cleaned, flags=re.IGNORECASE)
        cleaned = cleaned.strip().strip('"').strip("'")
        cleaned = re.sub(r"\s+", " ", cleaned).strip()
        return cleaned

    def _initialize_azure_clients(self) -> None:
        """Initialize Azure Storage clients."""
        try:
            self.blob_service_client = BlobServiceClient.from_connection_string(self.connect_str)
            self.container_client = self.blob_service_client.get_container_client(self.container_name)
        except Exception as e:
            print(f"Error initializing Azure clients: {e}")
            self.blob_service_client = None
            self.container_client = None

    def generate_topic_response(self, topic: str) -> Optional[Dict[str, Any]]:
        """
        Generate a synthetic 5-message conversation for a specific topic using Ollama.
        
        Args:
            topic: Topic/prompt to explore
        
        Returns:
            Dictionary with topic details and generated conversation turns, or None if generation fails
        """
        try:
            # 1) Assistant's first reply to the initial user topic
            model_response_prompt_1 = (
                "You are the AI assistant. Respond helpfully and concisely in 2-4 sentences. "
                "Do not show analysis, chain-of-thought, or meta commentary. "
                "Avoid repeating the user's wording. Output only the assistant's message.\n\n"
                f"User: {topic}\n\nAssistant:"
            )
            start_time_1 = time.time()
            result_1 = subprocess.run(
                ['ollama', 'run', self.model, model_response_prompt_1],
                capture_output=True,
                text=True,
                timeout=180
            )
            if result_1.returncode != 0:
                raise subprocess.CalledProcessError(result_1.returncode, result_1.args, result_1.stdout, result_1.stderr)
            model_response_1 = self._sanitize_text(result_1.stdout)
            t1 = round(time.time() - start_time_1, 2)

            # 2) Human's reply (synthetic) to assistant's first message
            user_reply_prompt_1 = (
                "Simulate the human's reply to the assistant. Be conversational and add a new detail or question. "
                "Keep it under 2 sentences. No analysis or labels. Output only the message.\n\n"
                f"Conversation so far:\nUser: {topic}\nAssistant: {model_response_1}"
            )
            start_time_2 = time.time()
            result_2 = subprocess.run(
                ['ollama', 'run', self.model, user_reply_prompt_1],
                capture_output=True,
                text=True,
                timeout=180
            )
            if result_2.returncode != 0:
                raise subprocess.CalledProcessError(result_2.returncode, result_2.args, result_2.stdout, result_2.stderr)
            user_reply_1 = self._sanitize_text(result_2.stdout)
            t2 = round(time.time() - start_time_2, 2)

            # 3) Assistant's second reply
            model_response_prompt_2 = (
                "You are the AI assistant. Continue the conversation helpfully in 2-4 sentences. "
                "Avoid meta commentary or chain-of-thought. Output only the assistant's message.\n\n"
                f"Conversation so far:\nUser: {topic}\nAssistant: {model_response_1}\nUser: {user_reply_1}\n\nAssistant:"
            )
            start_time_3 = time.time()
            result_3 = subprocess.run(
                ['ollama', 'run', self.model, model_response_prompt_2],
                capture_output=True,
                text=True,
                timeout=180
            )
            if result_3.returncode != 0:
                raise subprocess.CalledProcessError(result_3.returncode, result_3.args, result_3.stdout, result_3.stderr)
            model_response_2 = self._sanitize_text(result_3.stdout)
            t3 = round(time.time() - start_time_3, 2)

            # 4) Human's final one-sentence wrap-up
            user_final_prompt = (
                "Simulate the human's final message. One sentence max, natural acknowledgement or brief follow-up. "
                "No repetition, no analysis, no labels. Output only the message.\n\n"
                f"Conversation so far:\nUser: {topic}\nAssistant: {model_response_1}\nUser: {user_reply_1}\nAssistant: {model_response_2}"
            )
            start_time_4 = time.time()
            result_4 = subprocess.run(
                ['ollama', 'run', self.model, user_final_prompt],
                capture_output=True,
                text=True,
                timeout=180
            )
            if result_4.returncode != 0:
                raise subprocess.CalledProcessError(result_4.returncode, result_4.args, result_4.stdout, result_4.stderr)
            user_final = self._sanitize_text(result_4.stdout)
            t4 = round(time.time() - start_time_4, 2)

        except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
            print(f"Error generating conversation for topic: {topic}")
            print(f"Error details: {str(e)}")
            return None

        return {
            'topic': topic,
            'model_response_1': model_response_1,
            'user_reply_1': user_reply_1,
            'model_response_2': model_response_2,
            'user_final': user_final,
            'inference_times': [t1, t2, t3, t4],
            'model': self.model
        }

    def generate_dataset(self) -> List[Dict[str, Any]]:
        """
        Generate responses for all topics.
        
        Returns:
            List of topic responses
        """
        dataset = []
        
        for topic in tqdm(self.topics, desc="Generating conversations"):
            if response := self.generate_topic_response(topic):
                dataset.append(response)
        
        return dataset

    def upload_to_azure(self, data: str, blob_name: str) -> bool:
        """
        Upload data to Azure Blob Storage.
        
        Args:
            data: Data to upload
            blob_name: Name of the blob
        
        Returns:
            True if successful, False otherwise
        """
        if not (self.blob_service_client and self.container_client):
            print("Azure Storage clients not properly initialized")
            return False
            
        try:
            data_bytes = data.encode('utf-8')
            blob_client = self.container_client.get_blob_client(blob_name)
            blob_client.upload_blob(data_bytes, overwrite=True)
            print(f"Uploaded to Azure: {blob_name}")
            return True
            
        except Exception as e:
            print(f"Error uploading to Azure: {e}")
            return False

    def generate_labelbox_json(self, dataset: List[Dict[str, Any]], save_locally: bool = False) -> None:
        """
        Convert dataset to Labelbox JSON format and save to Azure and/or locally.
        
        Args:
            dataset: List of generated topic responses
            save_locally: Whether to also save files locally
        """
        for data in dataset:
            filename = self._sanitize_filename(data['topic']) + '.json'
            
            # Generate unique IDs for actors and messages (linear chain of 5 messages)
            user_id = str(uuid.uuid4())
            model_id = str(uuid.uuid4())
            message1_id = str(uuid.uuid4())  # initial user topic
            message2_id = str(uuid.uuid4())  # model response 1
            message3_id = str(uuid.uuid4())  # user reply 1 (synthetic)
            message4_id = str(uuid.uuid4())  # model response 2
            message5_id = str(uuid.uuid4())  # user final (synthetic)
            
            labelbox_json = {
                "type": "application/vnd.labelbox.conversational.model-chat-evaluation",
                "version": 2,
                "actors": {
                    user_id: {
                        "role": "human",
                        "metadata": {"name": "User"}
                    },
                    model_id: {
                        "role": "model",
                        "metadata": {
                            "modelConfigName": data['model'],
                            "modelConfigId": str(uuid.uuid4())
                        }
                    }
                },
                "messages": {
                    message1_id: {
                        "actorId": user_id,
                        "content": [
                            {"type": "text", "content": data['topic']}
                        ],
                        "childMessageIds": [message2_id]
                    },
                    message2_id: {
                        "actorId": model_id,
                        "content": [
                            {"type": "text", "content": data['model_response_1']}
                        ],
                        "childMessageIds": [message3_id]
                    },
                    message3_id: {
                        "actorId": user_id,
                        "content": [
                            {"type": "text", "content": data['user_reply_1']}
                        ],
                        "childMessageIds": [message4_id]
                    },
                    message4_id: {
                        "actorId": model_id,
                        "content": [
                            {"type": "text", "content": data['model_response_2']}
                        ],
                        "childMessageIds": [message5_id]
                    },
                    message5_id: {
                        "actorId": user_id,
                        "content": [
                            {"type": "text", "content": data['user_final']}
                        ],
                        "childMessageIds": []
                    }
                },
                "rootMessageIds": [message1_id]
            }
            
            json_str = json.dumps(labelbox_json, indent=2, ensure_ascii=False)
            
            if self.blob_service_client and self.container_client:
                self.upload_to_azure(json_str, filename)
            
            if save_locally:
                os.makedirs('labelbox_outputs', exist_ok=True)
                output_path = os.path.join('labelbox_outputs', filename)
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(json_str)
                print(f"Saved locally: {output_path}")

    @staticmethod
    def _sanitize_filename(filename: str) -> str:
        """
        Convert topic to a valid filename.
        
        Args:
            filename: Original filename/topic
        
        Returns:
            Sanitized filename
        """
        filename = re.sub(r'[<>:"/\\|?*]', '', filename)
        filename = filename.replace(' ', '_')[:100].strip('_')
        return filename or 'unnamed_topic'

def main():
    # Azure Storage configuration
    connect_str = os.getenv('connect_str')  # Azure Storage connection string from the environment
    container_name = "synthetic-chat-eval"

    # Example prompts: common developer error messages
    chat_eval = [
        "Why am I getting a 'SAML InResponseTo validation failed' error in Auth0?",
        "What does 'Connection refused: connect' error mean and how can I resolve it?",
        "I'm encountering a NullPointerException while invoking the 'getEmail()' method. What could be the issue?",
        "Why is my SQLite database throwing a 'no such table' error?",
        "What is causing the 'Cannot read property 'length' of undefined' error in my JavaScript code?"
    ]

    try:
        generator = OllamaTopicGenerator(
            model='gpt-oss:20b',
            temperature=0.1,
            top_p=0.9,
            topics=chat_eval,
            num_responses=1,
            connect_str=connect_str,
            container_name=container_name
        )
        
        topic_responses = generator.generate_dataset()
        generator.generate_labelbox_json(topic_responses, save_locally=True)  # keep save_locally=True to also write a local copy
        
    except Exception as e:
        print(f"Error in main execution: {e}")

if __name__ == '__main__':
    main()
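
To run the script above, you'll need Ollama installed locally with the model pulled (ollama pull gpt-oss:20b), plus the Python dependencies (pip install azure-storage-blob tqdm). The Azure upload is optional: without a connection string the script still works, and save_locally=True writes the JSON files to labelbox_outputs/.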

Send the data to Labelbox*

Code to import your Labelbox-formatted file(s) from Azure into a Labelbox dataset
import labelbox as lb
from azure.storage.blob import BlobServiceClient, ContainerClient
import os

API_KEY = ''  # your Labelbox API key
client = lb.Client(api_key=API_KEY)

# Azure connection string
connect_str = os.environ.get('connect_str')
service = BlobServiceClient.from_connection_string(connect_str)
container_name = 'synthetic-chat-eval'
container_client = service.get_container_client(container_name)

# Retrieve the first Labelbox IAM (delegated access) integration
organization = client.get_organization()
iam_integration = organization.get_iam_integrations()[0]
print(f"Using IAM integration: {iam_integration.name}")

dataset = client.create_dataset(name=f"Azure_{container_name}", iam_integration=iam_integration)
blob_list = container_client.list_blobs()

uploads = []
for blob in blob_list:
    url = f"{service.url}{container_client.container_name}/{blob.name}"
    uploads.append(dict(row_data = url, global_key = blob.name))

task = dataset.create_data_rows(uploads)
task.wait_till_done()
print(task.errors)
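
Note that the data rows only reference the blob URLs: the IAM (delegated access) integration attached to the dataset is what allows Labelbox to read your private container, so make sure the integration selected above has access to it.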

Configure the MMC (offline) in Labelbox
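
If you'd rather script this step as well, here is a minimal sketch using the Python SDK (method names per recent versions of the labelbox SDK; project and batch names are placeholders, and it assumes the ontology and uploads variables from the snippets above are in scope):

import labelbox as lb

client = lb.Client(api_key='')  # your Labelbox API key

# Offline MMC project (no live inference)
project = client.create_offline_model_evaluation_project(
    name="synthetic-chat-rubric-eval"  # placeholder name
)

# Attach the rubric ontology created earlier
project.connect_ontology(ontology)

# Queue the uploaded conversations by their global keys
project.create_batch(
    "synthetic-chat-batch-1",  # placeholder name
    global_keys=[u["global_key"] for u in uploads]
)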

And…

Tada! Happy labeling

*You can use other cloud providers to store your data.
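
For example, a drop-in alternative to upload_to_azure using AWS S3 might look like this (a sketch assuming boto3 is installed and your AWS credentials are configured; the bucket name is a placeholder):

import boto3

def upload_to_s3(data: str, key: str, bucket: str = "synthetic-chat-eval") -> bool:
    """Upload a JSON string to S3; mirrors upload_to_azure above."""
    try:
        s3 = boto3.client("s3")
        s3.put_object(Bucket=bucket, Key=key, Body=data.encode("utf-8"))
        print(f"Uploaded to S3: {key}")
        return True
    except Exception as e:
        print(f"Error uploading to S3: {e}")
        return False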
