Create a training dataset

To train a machine learning model, you will need a dataset that meets the following conditions:

  • At least 15 images
  • At least 80% of the images have labels
  • For each training label, at least 10 examples

When you label your dataset, include:

  • images with and without the categories you’re looking to identify
  • a roughly equal number of images for each category
  • images from your production environment, including lighting and camera quality
  • examples from every angle and distance that you expect the model to handle

Create a dataset

You can create a dataset using the web UI, the CLI, or one of the SDKs:

  1. Navigate to the DATA page and open the DATASETS tab.

  2. Click the + Create dataset button.

    The **DATASETS** tab of the **DATA** page, showing the **+ Create dataset** button.
  3. Enter a unique name for the dataset.

  4. Click Create dataset.

Run the following Viam CLI command to create a dataset, replacing the <org-id> and <name> placeholders with your organization ID and a unique name for the dataset:

viam dataset create --org-id=<org-id> --name=<name>

To create a dataset, pass a unique dataset name and organization ID to data_client.create_dataset:

import asyncio

from viam.rpc.dial import DialOptions, Credentials
from viam.app.viam_client import ViamClient

# Configuration constants – every value below is an empty placeholder;
# replace each one with your actual value before running the script.
API_KEY = ""  # API key; find or create it in your organization settings
API_KEY_ID = ""  # API key ID; find or create it in your organization settings
ORG_ID = ""  # organization ID passed to create_dataset; find it in your organization settings
DATASET_NAME = ""  # a unique, new name for the dataset you want to create


async def connect() -> ViamClient:
    """Create and return a ViamClient authenticated with the API key.

    Reads the ``API_KEY`` and ``API_KEY_ID`` module constants.
    """
    api_key_credentials = Credentials(type="api-key", payload=API_KEY)
    options = DialOptions(
        auth_entity=API_KEY_ID,
        credentials=api_key_credentials,
    )
    client = await ViamClient.create_from_dial_options(options)
    return client


async def main() -> int:
    """Create the dataset named ``DATASET_NAME`` in organization ``ORG_ID``.

    Returns:
        0 if the dataset was created, 1 if the request failed.
    """
    viam_client = await connect()
    try:
        data_client = viam_client.data_client

        print("Creating dataset...")
        try:
            dataset_id = await data_client.create_dataset(
                name=DATASET_NAME,
                organization_id=ORG_ID,
            )
        except Exception as e:
            print("Error creating dataset. It may already exist.")
            print("See: https://app.viam.com/data/datasets")
            print(f"Exception: {e}")
            return 1

        print(f"Created dataset: {dataset_id}")
        return 0
    finally:
        # Close the connection on every path, including the error return.
        viam_client.close()

if __name__ == "__main__":
    # Use main()'s return value as the process exit status so that a failed
    # run is visible to the calling shell or CI job.
    raise SystemExit(asyncio.run(main()))

To create a dataset, pass a unique dataset name and organization ID to DataClient.CreateDataset:

package main

import (
	"context"
	"fmt"
	"os"

	"go.viam.com/rdk/app"
	"go.viam.com/rdk/logging"
)

// main creates a dataset named datasetName in the organization orgID and
// prints the ID of the new dataset. It exits with a non-zero status if the
// client cannot be created or the dataset cannot be created.
func main() {
	// Placeholders – replace with your actual values.
	apiKey := ""
	apiKeyID := ""
	orgID := ""
	datasetName := ""

	logger := logging.NewDebugLogger("client")
	ctx := context.Background()
	viamClient, err := app.CreateViamClientWithAPIKey(
		ctx, app.Options{}, apiKey, apiKeyID, logger)
	if err != nil {
		logger.Fatal(err)
	}
	defer viamClient.Close()

	dataClient := viamClient.DataClient()

	fmt.Println("Creating dataset...")
	datasetID, err := dataClient.CreateDataset(ctx, datasetName, orgID)
	if err != nil {
		fmt.Println("Error creating dataset. It may already exist.")
		fmt.Printf("Exception: %v\n", err)
		// os.Exit skips deferred calls, so close the client explicitly
		// before exiting with a failure status.
		viamClient.Close()
		os.Exit(1)
	}
	fmt.Printf("Created dataset: %s\n", datasetID)
}

To create a dataset, pass a unique dataset name and organization ID to dataClient.createDataset:

import { createViamClient } from "@viamrobotics/sdk";

// Configuration constants – replace with your actual values.
// Declared with `const` because the script never reassigns them.
const API_KEY = "";  // API key, find or create in your organization settings
const API_KEY_ID = "";  // API key ID, find or create in your organization settings
const ORG_ID = "";  // your organization ID, find in your organization settings
const DATASET_NAME = "";  // a unique, new name for the dataset you want to create


/**
 * Open a Viam client authenticated with the API key and API key ID held in
 * API_KEY and API_KEY_ID.
 */
async function connect(): Promise<any> {
    const client = await createViamClient({
        credentials: {
            type: "api-key",
            payload: API_KEY,
            authEntity: API_KEY_ID,
        },
    });
    return client;
}

/**
 * Create the dataset named DATASET_NAME in organization ORG_ID.
 * Resolves to 0 when the dataset was created and to 1 when the request failed.
 */
async function main(): Promise<number> {
    const { dataClient } = await connect();

    console.log("Creating dataset...");
    try {
        const newDatasetId = await dataClient.createDataset(DATASET_NAME, ORG_ID);
        console.log(`Created dataset: ${newDatasetId}`);
        return 0;
    } catch (err) {
        console.log("Error creating dataset. It may already exist.");
        console.log("See: https://app.viam.com/data/datasets");
        console.log(`Exception: ${err}`);
        return 1;
    }
}

// Run the script and report main()'s result as the process exit status.
// process.exitCode is set (rather than calling process.exit) so pending
// output can finish before the process ends on its own.
main()
    .then((exitCode) => {
        process.exitCode = exitCode;
    })
    .catch((error) => {
        console.error("Script failed:", error);
        process.exit(1);
    });

You can now add images to your dataset.

Add to a dataset

You can add images to a dataset from the Images tab of the DATA page:

  1. Click to select the images you would like to add to your dataset.

  2. Click the Add to dataset button in the top right.

  3. From the Dataset dropdown, select the name of your dataset.

  4. Click Add <n> images to add the selected images to the dataset.

Use the Viam CLI to filter images by label and add the filtered images to a dataset:

  1. First, create a dataset, if you haven’t already.

  2. If you just created a dataset, use the dataset ID output by the creation command. If your dataset already exists, run the following command to get a list of dataset names and corresponding IDs:

    viam dataset list
    
  3. Run the following command to add all images labeled with a subset of tags to the dataset, replacing the <dataset-id> placeholder with the dataset ID output by the command in the previous step:

    viam dataset data add filter --dataset-id=<dataset-id> --tags=red_star,blue_square
    

To add an image to a dataset, find the binary data ID for the image and the dataset ID. Pass both IDs to data_client.add_binary_data_to_dataset_by_ids:

import asyncio

from viam.rpc.dial import DialOptions, Credentials
from viam.app.viam_client import ViamClient

# Configuration constants – every value below is an empty placeholder;
# replace each one with your actual value before running the script.
API_KEY = ""  # API key; find or create it in your organization settings
API_KEY_ID = ""  # API key ID; find or create it in your organization settings
DATASET_ID = ""  # the ID of the existing dataset you want to add the image to
BINARY_DATA_ID = ""  # the binary data ID of the image you want to add to the dataset


async def connect() -> ViamClient:
    """Create and return a ViamClient authenticated with the API key.

    Reads the ``API_KEY`` and ``API_KEY_ID`` module constants.
    """
    api_key_credentials = Credentials(type="api-key", payload=API_KEY)
    options = DialOptions(
        auth_entity=API_KEY_ID,
        credentials=api_key_credentials,
    )
    client = await ViamClient.create_from_dial_options(options)
    return client


async def main() -> int:
    """Add the image ``BINARY_DATA_ID`` to the dataset ``DATASET_ID``.

    Returns:
        0 on success. Errors raised by the request propagate to the caller
        after the client has been closed.
    """
    viam_client = await connect()
    try:
        data_client = viam_client.data_client

        print("Adding image to dataset...")
        await data_client.add_binary_data_to_dataset_by_ids(
            binary_ids=[BINARY_DATA_ID],
            dataset_id=DATASET_ID,
        )
        return 0
    finally:
        # Close the connection even if the request raises.
        viam_client.close()

if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(asyncio.run(main()))

To add an image to a dataset, find the binary data ID for the image and the dataset ID. Pass both IDs to DataClient.AddBinaryDataToDatasetByIDs:

package main

import (
	"context"
	"fmt"

	"go.viam.com/rdk/app"
	"go.viam.com/rdk/logging"
)

// main adds one image, identified by its binary data ID, to the dataset
// identified by datasetID, and prints the outcome.
func main() {
	// Placeholders – replace with your actual values.
	var (
		apiKey       = ""
		apiKeyID     = ""
		datasetID    = ""
		binaryDataID = ""
	)

	ctx := context.Background()
	logger := logging.NewDebugLogger("client")

	client, err := app.CreateViamClientWithAPIKey(ctx, app.Options{}, apiKey, apiKeyID, logger)
	if err != nil {
		logger.Fatal(err)
	}
	defer client.Close()

	data := client.DataClient()

	fmt.Println("Adding image to dataset...")
	imageIDs := []string{binaryDataID}
	if addErr := data.AddBinaryDataToDatasetByIDs(ctx, imageIDs, datasetID); addErr != nil {
		fmt.Println("Error adding image to dataset.")
		fmt.Printf("Exception: %v\n", addErr)
		return
	}
	fmt.Println("Image added to dataset successfully")
}

To add an image to a dataset, find the binary data ID for the image and the dataset ID. Pass both IDs to dataClient.addBinaryDataToDatasetByIds:

import { createViamClient } from "@viamrobotics/sdk";

// Configuration constants – replace with your actual values.
// Declared with `const` because the script never reassigns them.
const API_KEY = "";  // API key, find or create in your organization settings
const API_KEY_ID = "";  // API key ID, find or create in your organization settings
const DATASET_ID = "";  // the ID of the dataset you want to add the image to
const BINARY_DATA_ID = "";  // the ID of the image you want to add to the dataset

/**
 * Open a Viam client authenticated with the API key and API key ID held in
 * API_KEY and API_KEY_ID.
 */
async function connect(): Promise<any> {
    const client = await createViamClient({
        credentials: {
            type: "api-key",
            payload: API_KEY,
            authEntity: API_KEY_ID,
        },
    });
    return client;
}

/**
 * Add the image BINARY_DATA_ID to the dataset DATASET_ID.
 * Resolves to 0 once the request has completed.
 */
async function main(): Promise<number> {
    const { dataClient } = await connect();

    console.log("Adding image to dataset...");
    const imageIds = [BINARY_DATA_ID];
    await dataClient.addBinaryDataToDatasetByIds(imageIds, DATASET_ID);

    return 0;
}

// Run the script and report main()'s result as the process exit status.
main()
    .then((exitCode) => {
        process.exitCode = exitCode;
    })
    .catch((error) => {
        console.error("Script failed:", error);
        process.exit(1);
    });

Add all images captured by a specific machine to a dataset

You can add images to a dataset from the Images tab of the DATA page:

  1. From the Machine name dropdown, select the name of a machine.
  2. Click the Apply button at the bottom of the left sidebar.
  3. Click to select the images you would like to add to your dataset.
  4. Click the Add to dataset button in the top right.
  5. From the Dataset dropdown, select the name of your dataset.
  6. Click Add <n> images to add the selected images to the dataset.

The following script adds up to MAX_MATCHES images captured by a specific machine part to an existing dataset:

import asyncio
from typing import List, Optional
from viam.utils import create_filter

from viam.rpc.dial import DialOptions, Credentials
from viam.app.viam_client import ViamClient

# Configuration constants – replace with your actual values
API_KEY = ""  # API key, find or create in your organization settings
API_KEY_ID = ""  # API key ID, find or create in your organization settings
ORG_ID = ""  # your organization ID; NOTE(review): not referenced by the script shown here
PART_ID = ""  # the ID of the machine part whose captured data is added to the dataset
DATASET_ID = ""  # the ID of the existing dataset the images are added to
MAX_MATCHES = 50  # fetching stops once at least this many data objects are collected


async def connect() -> ViamClient:
    """Create and return a ViamClient authenticated with the API key.

    Reads the ``API_KEY`` and ``API_KEY_ID`` module constants.
    """
    api_key_credentials = Credentials(type="api-key", payload=API_KEY)
    options = DialOptions(
        auth_entity=API_KEY_ID,
        credentials=api_key_credentials,
    )
    client = await ViamClient.create_from_dial_options(options)
    return client

async def fetch_binary_data_ids(data_client, part_id: str) -> List:
    """Fetch metadata for binary data captured by one machine part.

    Despite the function's name, the returned items are the data objects
    yielded by ``binary_data_by_filter``, not bare ID strings; the ID of
    each item is read from its ``metadata.binary_data_id`` attribute.

    Args:
        data_client: The data client used to query binary data.
        part_id: ID of the machine part whose data should be fetched.

    Returns:
        At most ``MAX_MATCHES`` data objects. The binary payload itself is
        not requested (``include_binary_data=False``).
    """
    data_filter = create_filter(part_id=part_id)
    all_matches = []
    last: Optional[str] = None

    print("Getting data for part...")

    while len(all_matches) < MAX_MATCHES:
        print("Fetching more data...")
        # Each call returns one page; ``last`` is handed to the next call
        # so it continues after this page.
        data, _, last = await data_client.binary_data_by_filter(
            data_filter,
            limit=50,
            last=last,
            include_binary_data=False,
        )
        if not data:
            break
        all_matches.extend(data)

    # A full page can carry the total past MAX_MATCHES; trim the overshoot.
    return all_matches[:MAX_MATCHES]

async def main() -> int:
    """Add the images captured by part ``PART_ID`` to dataset ``DATASET_ID``.

    Returns:
        0 on completion, including when no images were found. Errors raised
        by the requests propagate to the caller after the client is closed.
    """
    viam_client = await connect()
    try:
        data_client = viam_client.data_client

        matching_data = await fetch_binary_data_ids(data_client, PART_ID)
        if not matching_data:
            # Nothing to add; skip the request instead of sending an
            # empty ID list.
            print("No images found for part.")
            return 0

        await data_client.add_binary_data_to_dataset_by_ids(
            binary_ids=[
                obj.metadata.binary_data_id for obj in matching_data
            ],
            dataset_id=DATASET_ID,
        )

        print("Added files to dataset:")
        print(f"https://app.viam.com/data/datasets?id={DATASET_ID}")
        return 0
    finally:
        # Close the connection even if a request raises.
        viam_client.close()

if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(asyncio.run(main()))

The following script adds up to maxMatches images captured by a specific machine part during the last 200 hours to an existing dataset:

package main

import (
	"context"
	"fmt"
	"time"

	"go.viam.com/rdk/app"
	"go.viam.com/rdk/logging"
)


// fetchBinaryDataIDs collects the binary data IDs of data captured by the
// machine part partID within the last 200 hours.
//
// Results are requested page by page until maxMatches IDs have been
// collected or a page comes back empty. At most maxMatches IDs are returned.
// An error from the data client is wrapped and returned with a nil slice.
func fetchBinaryDataIDs(
	ctx context.Context,
	dataClient *app.DataClient,
	partID string,
	maxMatches int) ([]string, error) {
	// Read the clock once so both ends of the window share one reference.
	now := time.Now()
	filter := &app.Filter{
		PartID: partID,
		Interval: app.CaptureInterval{
			Start: now.Add(-200 * time.Hour),
			End:   now,
		},
	}

	var allMatches []string
	last := ""

	fmt.Println("Getting data for part...")

	for len(allMatches) < maxMatches {
		fmt.Println("Fetching more data...")

		resp, err := dataClient.BinaryDataByFilter(
			ctx, false, &app.DataByFilterOptions{
				Filter:              filter,
				Limit:               5,
				Last:                last,
				IncludeInternalData: false,
			},
		)
		if err != nil {
			return nil, fmt.Errorf("failed to fetch binary data: %w", err)
		}
		if len(resp.BinaryData) == 0 {
			break
		}
		for _, data := range resp.BinaryData {
			allMatches = append(allMatches, data.Metadata.BinaryDataID)
		}
		// Feed the page marker back into the next request.
		last = resp.Last
	}

	// The final page can push the total past maxMatches; trim the overshoot.
	if maxMatches >= 0 && len(allMatches) > maxMatches {
		allMatches = allMatches[:maxMatches]
	}

	fmt.Println("All matches:")
	fmt.Println(allMatches)

	return allMatches, nil
}

// main fetches the binary data IDs of images captured by the machine part
// partID and adds them to the dataset identified by datasetID.
func main() {
	// Placeholders – replace with your actual values.
	var (
		apiKey     = ""
		apiKeyID   = ""
		partID     = ""
		datasetID  = ""
		maxMatches = 50
	)

	ctx := context.Background()
	logger := logging.NewDebugLogger("client")

	client, err := app.CreateViamClientWithAPIKey(ctx, app.Options{}, apiKey, apiKeyID, logger)
	if err != nil {
		logger.Fatal(err)
	}
	defer client.Close()

	data := client.DataClient()

	fmt.Println("Fetching machine images...")
	imageIDs, err := fetchBinaryDataIDs(ctx, data, partID, maxMatches)
	if err != nil {
		fmt.Println("Error fetching machine images.")
		fmt.Printf("Exception: %v\n", err)
		return
	}
	fmt.Printf("Fetched %d machine images.\n", len(imageIDs))

	fmt.Println("Adding machine images to dataset...")
	if addErr := data.AddBinaryDataToDatasetByIDs(ctx, imageIDs, datasetID); addErr != nil {
		fmt.Println("Error adding machine images to dataset.")
		fmt.Printf("Exception: %v\n", addErr)
		return
	}
	fmt.Println("Machine images added to dataset successfully")
}

The following script adds up to MAX_MATCHES images captured by a specific machine part to an existing dataset:

import { createViamClient } from "@viamrobotics/sdk";

// Configuration constants – replace with your actual values.
// Declared with `const` because the script never reassigns them.
const API_KEY = "";  // API key, find or create in your organization settings
const API_KEY_ID = "";  // API key ID, find or create in your organization settings
const ORG_ID = "";  // your organization ID, find in your organization settings
const PART_ID = "";  // the part ID of the binary data you want to add to the dataset
const DATASET_ID = "";  // the ID of the dataset you want to add the image to
const MAX_MATCHES = 50;  // the maximum number of binary data objects to fetch

/**
 * Open a Viam client authenticated with the API key and API key ID held in
 * API_KEY and API_KEY_ID.
 */
async function connect(): Promise<any> {
    const client = await createViamClient({
        credentials: {
            type: "api-key",
            payload: API_KEY,
            authEntity: API_KEY_ID,
        },
    });
    return client;
}

/**
 * Fetch metadata for binary data captured by one machine part.
 *
 * Despite the function's name, the resolved array holds the data objects
 * returned by `binaryDataByFilter`, not bare ID strings; the ID of each item
 * is read from its `metadata.binaryDataId` property. At most MAX_MATCHES
 * objects are returned.
 */
async function fetchBinaryDataIds(dataClient: any, partId: string): Promise<any[]> {
    const dataFilter = { partId: partId };
    const allMatches: any[] = [];
    let last: string | undefined = undefined;

    console.log("Getting data for part...");

    while (allMatches.length < MAX_MATCHES) {
        console.log("Fetching more data...");
        const result = await dataClient.binaryDataByFilter(
            dataFilter,
            50,
            0,
            last,
            false  // includeBinary = false to allow limit > 1
        );

        // Accept either a result object with `data`/`last` or a bare array.
        const data = result.data || result;
        const newLast = result.last;

        if (!data || data.length === 0) {
            break;
        }
        allMatches.push(...data);
        last = newLast;
    }

    // A full page can carry the total past MAX_MATCHES; trim the overshoot.
    return allMatches.slice(0, MAX_MATCHES);
}

/**
 * Add the images captured by part PART_ID to the dataset DATASET_ID.
 * Resolves to 0 once the request has completed.
 */
async function main(): Promise<number> {
    const viamClient = await connect();
    const dataClient = viamClient.dataClient;

    const matchingData = await fetchBinaryDataIds(dataClient, PART_ID);
    console.log(`Found ${matchingData.length} matching data objects`);
    console.log(matchingData);

    await dataClient.addBinaryDataToDatasetByIds(
        // The elements are data objects carrying `metadata`; the explicit
        // `any` lets this property access type-check regardless of the
        // element type declared by fetchBinaryDataIds.
        matchingData.map((obj: any) => obj.metadata.binaryDataId),
        DATASET_ID
    );

    console.log("Added files to dataset:");
    console.log(`https://app.viam.com/data/datasets?id=${DATASET_ID}`);

    return 0;
}

// Run the script and report main()'s result as the process exit status.
main()
    .then((exitCode) => {
        process.exitCode = exitCode;
    })
    .catch((error) => {
        console.error("Script failed:", error);
        process.exit(1);
    });

Use an existing dataset

If you have used the viam dataset export command to export a dataset, or if you have been given a dataset by someone else, you can use the following script to import the dataset. If your dataset was not exported with Viam, you will need to adapt the script.

# Assumption: The dataset was exported using the `viam dataset export` command.
# This script is being run from the `destination` directory.

import asyncio
import os
import json

from viam.rpc.dial import DialOptions, Credentials
from viam.app.viam_client import ViamClient

# Configuration constants – replace with your actual values
API_KEY = ""  # API key, find or create in your organization settings
API_KEY_ID = ""  # API key ID, find or create in your organization settings
ORG_ID = ""  # the ID of the organization in which the dataset is created
PART_ID = ""  # the machine part ID passed along with each image upload
LOCATION_ID = ""  # a location ID; NOTE(review): not referenced by the script shown here
DATASET_NAME = ""  # a name for the new dataset this script creates
FOLDER_NAME = ""  # the exported folder containing the metadata/ and data/ subfolders


async def connect() -> ViamClient:
    """Create and return a ViamClient authenticated with the API key.

    Reads the ``API_KEY`` and ``API_KEY_ID`` module constants.
    """
    api_key_credentials = Credentials(type="api-key", payload=API_KEY)
    options = DialOptions(
        auth_entity=API_KEY_ID,
        credentials=api_key_credentials,
    )
    client = await ViamClient.create_from_dial_options(options)
    return client


async def main() -> int:
    """Import a dataset exported with ``viam dataset export``.

    Creates a dataset named ``DATASET_NAME``, then walks the JSON files in
    ``FOLDER_NAME/metadata``. For each one it uploads the matching image
    from ``FOLDER_NAME/data``, re-applies the bounding boxes and tags found
    in the JSON, and finally adds every uploaded image to the new dataset.

    Returns:
        0 on success, 1 if the dataset could not be created.
    """
    viam_client = await connect()
    try:
        data_client = viam_client.data_client

        print("Creating dataset...")
        try:
            dataset_id = await data_client.create_dataset(
                name=DATASET_NAME,
                organization_id=ORG_ID,
            )
        except Exception as e:
            print("Error creating dataset. It may already exist.")
            print("See: https://app.viam.com/data/datasets")
            print(f"Exception: {e}")
            return 1
        print(f"Created dataset: {dataset_id}")

        metadata_dir = os.path.join(FOLDER_NAME, "metadata")
        data_dir = os.path.join(FOLDER_NAME, "data")
        file_ids = []

        for file_name in os.listdir(metadata_dir):
            # Each metadata file is assumed to be named "<image file>.json";
            # anything else has no image to pair with.
            if not file_name.endswith(".json"):
                print(f"Skipping file: {file_name} because it is not a .json file")
                continue

            # Read the metadata and close the file before any network call.
            with open(os.path.join(metadata_dir, file_name)) as f:
                try:
                    data = json.load(f)
                except Exception as e:
                    print(f"Skipping file: {file_name} because it is not valid JSON")
                    print(f"Exception: {e}")
                    continue

            capture_metadata = data.get("captureMetadata") or {}
            tags = capture_metadata.get("tags")
            annotations = data.get("annotations")
            print(data)
            print(annotations)

            # Strip only the trailing extension so a ".json" elsewhere in
            # the name is left intact.
            image_file = file_name[: -len(".json")]

            print("Uploading: " + image_file)

            file_id = await data_client.file_upload_from_path(
                part_id=PART_ID,
                tags=tags,
                filepath=os.path.join(data_dir, image_file),
            )
            print("FileID: " + file_id)

            # Annotations may be present without any bounding boxes.
            bboxes = (annotations or {}).get("bboxes") or []
            for box in bboxes:
                await data_client.add_bounding_box_to_image_by_id(
                    binary_id=file_id,
                    label=box["label"],
                    x_min_normalized=box["xMinNormalized"],
                    y_min_normalized=box["yMinNormalized"],
                    x_max_normalized=box["xMaxNormalized"],
                    y_max_normalized=box["yMaxNormalized"],
                )

            if tags:
                await data_client.add_tags_to_binary_data_by_ids(
                    tags=tags,
                    binary_ids=[file_id],
                )

            file_ids.append(file_id)

        if not file_ids:
            # Nothing was uploaded; skip the request instead of sending an
            # empty ID list.
            print("No files were uploaded; nothing to add to the dataset.")
            return 0

        await data_client.add_binary_data_to_dataset_by_ids(
            binary_ids=file_ids,
            dataset_id=dataset_id,
        )
        print("Added files to dataset.")
        print("https://app.viam.com/data/datasets?id=" + dataset_id)
        return 0
    finally:
        # Close the connection on every path, including early returns.
        viam_client.close()

if __name__ == '__main__':
    # Use main()'s return value as the process exit status so that a failed
    # run is visible to the calling shell or CI job.
    raise SystemExit(asyncio.run(main()))
Looking for test datasets?

We have two datasets you can use for testing, one with shapes and the other with a wooden figure:

The shapes dataset. The datasets subtab of the data tab in the Viam app, showing a custom 'viam-figure' dataset of 25 images, most containing the wooden Viam figure
  1. Download the shapes dataset or download the wooden figure dataset.
  2. Unzip the download.
  3. Use the above script.