While Labelbox natively supports most major cloud providers, you can also work with alternatives like Nebius (a platform that provides storage and model training).
In this guide we will import data from Nebius into Labelbox and work around the limit on presigned URL expiry (a maximum of 7 days).
Requirements:

- A Labelbox account (and an API key)
- A Nebius account with a service account that can read the data
- The boto3 and labelbox Python packages installed
We will assume you want to import all of your data and that there are no duplicate file names (we use the file name as the global_key).

First, let's set up a Nebius service account:

IAM → Service Account → Create entity

Once it is created, you can add it to a custom group or give it the same level of permissions as the pre-existing editors.

Create a key pair and save it securely.
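The snippets below use the placeholders <SERVICE_ACCOUNT_KEY_ID> and <SERVICE_ACCOUNT_ACCESS_KEY>; in practice you may prefer to keep the key pair out of the code entirely. Here is a minimal sketch that reads it from environment variables (NEBIUS_KEY_ID and NEBIUS_SECRET_KEY are example names of our own choosing, not a Nebius convention):

```python
import os
import boto3

# Read the service-account key pair from environment variables instead of
# hardcoding it in the script (variable names are just an example)
s3_client = boto3.client(
    service_name='s3',
    endpoint_url='https://storage.eu-west1.nebius.cloud',
    aws_access_key_id=os.environ['NEBIUS_KEY_ID'],
    aws_secret_access_key=os.environ['NEBIUS_SECRET_KEY'],
)
```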
Before uploading anything, make sure to set a CORS policy on the bucket so the data can be accessed from Labelbox:
```python
import boto3

s3_client = boto3.client(
    service_name='s3',
    endpoint_url='https://storage.eu-west1.nebius.cloud',
    aws_access_key_id='<SERVICE_ACCOUNT_KEY_ID>',
    aws_secret_access_key='<SERVICE_ACCOUNT_ACCESS_KEY>'
)

bucket_name = 'white-stingray-bucket-6'

# CORS configuration for Labelbox
cors_configuration = {
    'CORSRules': [{
        'AllowedHeaders': ['*'],
        'AllowedMethods': ['GET', 'HEAD'],
        'AllowedOrigins': [
            'https://app.labelbox.com',
            'https://editor.labelbox.com'
        ],
        'ExposeHeaders': [],
    }]
}

try:
    s3_client.put_bucket_cors(
        Bucket=bucket_name,
        CORSConfiguration=cors_configuration
    )
    print(f"✓ CORS configured for bucket: {bucket_name}")

    # Verify CORS settings
    response = s3_client.get_bucket_cors(Bucket=bucket_name)
    print(f"Current CORS rules: {response['CORSRules']}")
except Exception as e:
    print(f"Error configuring CORS: {e}")
```
Once that's done, let's get your data into Labelbox:
```python
import boto3
import labelbox as lb

# Connect to Nebius via boto3
s3_client = boto3.client(
    service_name='s3',
    endpoint_url='https://storage.eu-west1.nebius.cloud',
    aws_access_key_id='<SERVICE_ACCOUNT_KEY_ID>',
    aws_secret_access_key='<SERVICE_ACCOUNT_ACCESS_KEY>'
)

bucket_name = 'white-stingray-bucket-6'

# List the .jpeg files in the bucket
response = s3_client.list_objects_v2(Bucket=bucket_name)
files = [obj['Key'] for obj in response.get('Contents', []) if obj['Key'].endswith('.jpeg')]

# Build one data row per file, using a presigned URL as row_data
data_rows = []
for file_key in files:
    url = s3_client.generate_presigned_url(
        'get_object',
        Params={'Bucket': bucket_name, 'Key': file_key},
        ExpiresIn=604800  # 7 days in seconds, the current maximum
    )
    data_rows.append({
        "row_data": url,
        "global_key": file_key.split('/')[-1].replace('.jpeg', '')
    })

# Import to Labelbox
lb_client = lb.Client(api_key='<LABELBOX_API_KEY>')
dataset = lb_client.create_dataset(name="Image from Nebius", iam_integration=None)
task = dataset.create_data_rows(data_rows)
task.wait_till_done()

print(f"Imported {len(data_rows)} files to Labelbox")
print(f"Dataset ID: {dataset.uid}")
```
Now, 7 days is a real limitation for longer-running projects, so we can run a cron job that refreshes the presigned URLs before they expire.
Here is a script that does that, followed by an example schedule.
```python
import boto3
import labelbox as lb
from urllib.parse import urlparse, unquote


def regenerate_urls(dataset_id, bucket_name, prefix='', file_extension='.jpeg'):
    """
    Regenerate presigned URLs for all data rows in a dataset.
    Works without metadata by parsing the S3 key out of the existing URL,
    falling back to reconstructing it from the external_id.

    Args:
        dataset_id: Labelbox dataset ID
        bucket_name: Your Nebius bucket name
        prefix: Optional key prefix inside the bucket (empty by default)
        file_extension: Extension used when reconstructing keys from external_ids
    """
    # Initialize clients
    s3_client = boto3.client(
        service_name='s3',
        endpoint_url='https://storage.eu-west1.nebius.cloud',
        aws_access_key_id='<SERVICE_ACCOUNT_KEY_ID>',
        aws_secret_access_key='<SERVICE_ACCOUNT_ACCESS_KEY>'
    )
    lb_client = lb.Client(api_key='<LABELBOX_API_KEY>')

    dataset = lb_client.get_dataset(dataset_id)
    print(f"Regenerating URLs for dataset: {dataset.name}")

    data_rows = list(dataset.data_rows())
    print(f"Found {len(data_rows)} data rows")

    updated_count = 0
    failed_count = 0

    for i, data_row in enumerate(data_rows, 1):
        try:
            # Method 1: Try to parse the existing URL to extract the S3 key
            current_url = data_row.row_data
            s3_key = None
            if current_url and 'storage' in current_url:
                parsed = urlparse(current_url)
                path_parts = parsed.path.strip('/').split('/', 1)
                if len(path_parts) >= 2:
                    # path_parts[0] is the bucket, path_parts[1] is the key
                    s3_key = unquote(path_parts[1])

            # Method 2: Reconstruct from the external_id if parsing failed
            if not s3_key:
                s3_key = f"{prefix}{data_row.external_id}{file_extension}"

            # Generate a new signed URL (7 days)
            new_url = s3_client.generate_presigned_url(
                'get_object',
                Params={'Bucket': bucket_name, 'Key': s3_key},
                ExpiresIn=604800
            )

            # Update the data row
            data_row.update(row_data=new_url)
            updated_count += 1
        except Exception as e:
            failed_count += 1
            print(f"[{i}/{len(data_rows)}] Failed {data_row.external_id}: {e}")

    print(f"\n{'='*60}")
    print(f"Successfully updated: {updated_count}/{len(data_rows)}")
    if failed_count > 0:
        print(f"Failed: {failed_count}/{len(data_rows)}")
    print(f"{'='*60}")


# Usage example
if __name__ == '__main__':
    import sys

    if len(sys.argv) < 2:
        print("Usage: python refresh_urls.py <dataset_id>")
        sys.exit(1)

    dataset_id = sys.argv[1]

    # Configuration
    BUCKET_NAME = 'white-stingray-bucket-6'

    regenerate_urls(
        dataset_id=dataset_id,
        bucket_name=BUCKET_NAME,
    )
```
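To run this on a schedule, save the script (the usage message above assumes refresh_urls.py) and add a crontab entry that fires well inside the 7-day window, for example every five days at 03:00. The paths, log file, and dataset ID below are placeholders to adapt to your environment:

```
# m h dom mon dow  command
0 3 */5 * * /usr/bin/python3 /path/to/refresh_urls.py <DATASET_ID> >> /var/log/refresh_urls.log 2>&1
```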

