How To: Upload your data from Nebius Cloud to Labelbox

While Labelbox supports most cloud providers out of the box, it can also work with alternatives like Nebius (a platform that provides storage and training for models).

In this guide we will import data from Nebius into Labelbox and work around the limitation on presigned URL expiry (a maximum of 7 days).

Requirements:

  • A Labelbox account and an API key

  • A Nebius account with a service account that has read access to your data

  • The boto3 and labelbox Python packages installed

We will assume you want to import all of your data and that file names are unique, since they will be used as global_key values.
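
For example, with the file-name-based global_key used below, a hypothetical object key maps to its global key like this:

# Hypothetical object key, just to illustrate the naming assumption
file_key = 'images/cat_001.jpeg'
global_key = file_key.split('/')[-1].replace('.jpeg', '')
print(global_key)  # -> cat_001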

First let’s set up a Nebius Service Account:

IAM → Service Account → Create entity

Once it is created, you can add the service account to a custom group or grant it the same level of permissions as the pre-existing editors group.

Create a key pair and save it securely.
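
Rather than hard-coding the key pair in your scripts (the snippets below use placeholders), you can load it from environment variables. A minimal sketch, assuming NEBIUS_ACCESS_KEY_ID and NEBIUS_SECRET_ACCESS_KEY are variable names you export yourself:

import os
import boto3

# Assumed environment variable names; set them in your shell or secret manager
s3_client = boto3.client(
    service_name='s3',
    endpoint_url='https://storage.eu-west1.nebius.cloud',
    aws_access_key_id=os.environ['NEBIUS_ACCESS_KEY_ID'],
    aws_secret_access_key=os.environ['NEBIUS_SECRET_ACCESS_KEY']
)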

Before uploading, make sure to set a CORS policy on the bucket so the data can be loaded in the Labelbox editor:

import boto3

# Connect to Nebius Object Storage via its S3-compatible API
s3_client = boto3.client(
    service_name='s3',
    endpoint_url='https://storage.eu-west1.nebius.cloud',
    aws_access_key_id='<SERVICE_ACCOUNT_KEY_ID>',
    aws_secret_access_key='<SERVICE_ACCOUNT_ACCESS_KEY>'
)

bucket_name = 'white-stingray-bucket-6'

# CORS configuration for Labelbox
cors_configuration = {
    'CORSRules': [{
        'AllowedHeaders': ['*'],
        'AllowedMethods': ['GET', 'HEAD'],
        'AllowedOrigins': [
            'https://app.labelbox.com',
            'https://editor.labelbox.com'
        ],
        'ExposeHeaders': [],
    }]
}

try:
    s3_client.put_bucket_cors(
        Bucket=bucket_name,
        CORSConfiguration=cors_configuration
    )
    print(f"✓ CORS configured for bucket: {bucket_name}")
    
    # Verify CORS settings
    response = s3_client.get_bucket_cors(Bucket=bucket_name)
    print(f"Current CORS rules: {response['CORSRules']}")
    
except Exception as e:
    print(f"Error configuring CORS: {e}")

Once that is done, let's get your data into Labelbox:

import boto3
import labelbox as lb

# Connect to Nebius via boto3
s3_client = boto3.client(
    service_name='s3',
    endpoint_url='https://storage.eu-west1.nebius.cloud',
    aws_access_key_id='<SERVICE_ACCOUNT_KEY_ID>',
    aws_secret_access_key='<SERVICE_ACCOUNT_ACCESS_KEY>'
)

bucket_name = 'white-stingray-bucket-6'

# List objects in the bucket and keep only .jpeg files
# Note: list_objects_v2 returns at most 1000 keys per call; use a paginator for larger buckets
response = s3_client.list_objects_v2(Bucket=bucket_name)
files = [obj['Key'] for obj in response.get('Contents', []) if obj['Key'].endswith('.jpeg')]

data_rows = []
for file_key in files:
    
    url = s3_client.generate_presigned_url(
        'get_object',
        Params={'Bucket': bucket_name, 'Key': file_key},
        ExpiresIn=604800  # 7 days in seconds (the current maximum)
    )
    
    data_rows.append({
        "row_data": url,
        "global_key": file_key.split('/')[-1].replace('.jpeg', '')
    })

# Import to Labelbox
lb_client = lb.Client(api_key='<LABELBOX_API_KEY>')
dataset = lb_client.create_dataset(name="Images from Nebius", iam_integration=None)
task = dataset.create_data_rows(data_rows)
task.wait_till_done()

print(f"Imported {len(data_rows)} files to Labelbox")
print(f"Dataset ID: {dataset.uid}")

A 7-day expiry can be a problem for longer-running projects, so we can run a cron job that refreshes the presigned URLs before they expire.

Here is a script that can do that.

def regenerate_urls(dataset_id, bucket_name, prefix='', file_extension='.jpeg'):
    """
    Regenerate presigned URLs for all data rows in a dataset.
    Works without metadata by parsing S3 keys from the existing URLs,
    falling back to reconstructing them from external_ids.

    Args:
        dataset_id: Labelbox dataset ID
        bucket_name: Your Nebius bucket name
        prefix: Key prefix used when reconstructing keys from external_ids
        file_extension: File extension used when reconstructing keys from external_ids
    """

    import boto3
    import labelbox as lb
    from urllib.parse import urlparse, unquote
    
    # Initialize clients
    s3_client = boto3.client(
        service_name='s3',
        endpoint_url='https://storage.eu-west1.nebius.cloud',
        aws_access_key_id='<SERVICE_ACCOUNT_KEY_ID>',
        aws_secret_access_key='<SERVICE_ACCOUNT_ACCESS_KEY>'
    )
    
    lb_client = lb.Client(api_key='<LABELBOX_API_KEY>')
    dataset = lb_client.get_dataset(dataset_id)
    
    print(f"Regenerating URLs for dataset: {dataset.name}")
    
    data_rows = list(dataset.data_rows())
    print(f"Found {len(data_rows)} data rows")
    
    updated_count = 0
    failed_count = 0
    
    for i, data_row in enumerate(data_rows, 1):
        try:
            # Method 1: Try to parse existing URL to extract S3 key
            current_url = data_row.row_data
            s3_key = None
            
            if current_url and 'storage' in current_url:
            
                parsed = urlparse(current_url)
                path_parts = parsed.path.strip('/').split('/', 1)
                
                if len(path_parts) >= 2:
                    # path_parts[0] is bucket, path_parts[1] is the key
                    s3_key = unquote(path_parts[1])
            
            # Method 2: Reconstruct from external_id if parsing failed
            if not s3_key:
                s3_key = f"{prefix}{data_row.external_id}{file_extension}"
            
            # Generate new signed URL (7 days)
            new_url = s3_client.generate_presigned_url(
                'get_object',
                Params={'Bucket': bucket_name, 'Key': s3_key},
                ExpiresIn=604800
            )
            
            # Update data row
            data_row.update(row_data=new_url)
            updated_count += 1
            
        except Exception as e:
            failed_count += 1
            print(f"[{i}/{len(data_rows)}] Failed {data_row.external_id}: {e}")
    
    print(f"\n{'='*60}")
    print(f"Successfully updated: {updated_count}/{len(data_rows)}")
    if failed_count > 0:
        print(f"Failed: {failed_count}/{len(data_rows)}")
    print(f"{'='*60}")


# Usage example
if __name__ == '__main__':
    import sys
    
    if len(sys.argv) < 2:
        print("Usage: python refresh_urls.py <dataset_id>")
        sys.exit(1)
    
    dataset_id = sys.argv[1]
    
    # Configuration
    BUCKET_NAME = 'white-stingray-bucket-6'
    
    regenerate_urls(
        dataset_id=dataset_id,
        bucket_name=BUCKET_NAME,
    )
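
If several datasets point at the same bucket, one scheduled run can refresh them all; a minimal sketch with placeholder dataset IDs:

# Hypothetical list of Nebius-backed datasets to refresh in one cron run
NEBIUS_DATASETS = ['<DATASET_ID_1>', '<DATASET_ID_2>']

for ds_id in NEBIUS_DATASETS:
    regenerate_urls(dataset_id=ds_id, bucket_name='white-stingray-bucket-6')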