API Summary
Summary of the most important Deep Lake commands.
Deep Lake API Basics
Import and Installation
By default, Deep Lake does not install dependencies for audio, video, google-cloud, and other features. Details on installation options are available here.
!pip3 install deeplake
import deeplake
Loading Deep Lake Datasets
Deep Lake datasets can be stored at a variety of storage locations using the appropriate dataset_path parameter below. We support S3, GCS, Activeloop storage, and are constantly adding to the list.
# Load a Deep Lake Dataset
ds = deeplake.load('dataset_path', creds = {'optional'}, token = 'optional')
Creating Deep Lake Datasets
# Create an empty Deep Lake dataset
ds = deeplake.empty('dataset_path', creds = {'optional'}, token = 'optional')
# Create a Deep Lake Dataset with the same tensors as another dataset
ds = deeplake.like(ds_object or 'dataset_path', creds = {'optional'}, token = 'optional')
# Automatically create a Deep Lake Dataset from another data source
ds = deeplake.ingest('source_folder', 'deeplake_dataset_path', ... 'see API reference for details')
ds = deeplake.ingest_coco('images_folder', 'annotations.json', 'deeplake_dataset_path', ... 'see API reference for details')
Deleting Datasets
ds.delete()
deeplake.delete('dataset_path', creds = {'optional'}, token = 'optional')
Creating Tensors
# Specifying htype is recommended for maximizing performance.
ds.create_tensor('my_tensor', htype = 'bbox')
# Specifying the correct compression is critical for images, videos, audio and 
# other rich data types. 
ds.create_tensor('songs', htype = 'audio', sample_compression = 'mp3')
Creating Tensor Hierarchies
ds.create_group('my_group')
ds.my_group.create_tensor('my_tensor')
ds.create_tensor('my_group/my_tensor') #Automatically creates the group 'my_group'
Visualizing and Inspecting Datasets
ds.visualize()
ds.summary()
Appending Data to Datasets
ds.append({'tensor_1': np.ones((1,4)), 'tensor_2': deeplake.read('image.jpg')})
ds.my_group.append({'tensor_1': np.ones((1,4)), 'tensor_2': deeplake.read('image.jpg')})
Appending/Updating Data in Individual Tensors
# Append a single sample
ds.my_tensor.append(np.ones((1,4)))
ds.my_tensor.append(deeplake.read('image.jpg'))
# Append multiple samples. The first axis in the 
# numpy array is assumed to be the sample axis for the tensor
ds.my_tensor.extend(np.ones((5,1,4)))
# Editing or adding data at a specific index
ds.my_tensor[i] = deeplake.read('image.jpg')
# Removing samples by index
ds.pop(i)
Appending Empty Samples or Skipping Samples
# Data appended as None will be returned as an empty array
ds.append({'tensor_1': deeplake.read(...), 'tensor_2': None})
ds.my_tensor.append(None)
# Empty arrays can be explicitly appended if the length of the shape 
# of the empty array matches that of the other samples
ds.boxes.append(np.zeros((0,4)))
Accessing Tensor Data
# Read the i-th tensor sample
np_array = ds.my_tensor[i].numpy()
text = ds.my_text_tensor[i].data() # More comprehensive view of the data
bytes = ds.my_tensor[i].tobytes() # More comprehensive view of the data
# Read the i-th dataset sample as a numpy array
image = ds[i].images.numpy()
# Read the i-th labels as a numpy array or list of strings
labels_array = ds.labels[i].numpy()
labels_array = ds.labels[i].data()['value'] # same as .numpy()
labels_string_list = ds.labels[i].data()['text']
# Read a tensor sample from a hierarchical group
np_array = ds.my_group.my_tensor_1[i].numpy()
np_array = ds.my_group.my_tensor_2[i].numpy()
# Read multiple tensor samples into numpy array
np_array = ds.my_tensor[0:10].numpy() 
# Read multiple tensor samples into a list of numpy arrays
np_array_list = ds.my_tensor[0:10].numpy(aslist=True)
Maximizing performance
Make sure to use the with context when making any updates to datasets. 
with ds:
    ds.create_tensor('my_tensor')
    
    for i in range(10):
        ds.my_tensor.append(i)
Connecting Deep Lake Datasets to ML Frameworks
# PyTorch Dataloader
dataloader = ds.pytorch(batch_size = 16, transform = {'images': torchvision_tform, 'labels': None}, num_workers = 2, scheduler = 'threaded')
# TensorFlow Dataset
ds_tensorflow = ds.tensorflow()
# Enterprise Dataloader
dataloader = ds.dataloader().batch(batch_size = 64).pytorch(num_workers = 8)
Versioning Datasets
# Commit data
commit_id = ds.commit('Added 100 images of trucks')
# Print the commit log
log = ds.log()
# Checkout a branch or commit 
ds.checkout('branch_name' or commit_id)
# Create a new branch
ds.checkout('new_branch', create = True)
# Examine differences between commits
ds.diff()
Querying Datasets and Saving Dataset Views
view = ds.query("Select * where contains(labels, 'giraffe')")
view.save_view(optimize = True)
view = ds.load_view(id = 'query_id')
Adding Tensor and Dataset-Level Metadata
# Add or update dataset metadata
ds.info.update(key1 = 'text', key2 = number)
# Also can run ds.info.update({'key1': 'value1', 'key2': num_value})
# Add or update tensor metadata
ds.my_tensor.info.update(key1 = 'text', key2 = number)
# Delete metadata
ds.info.delete() #Delete all metadata
ds.info.delete('key1') #Delete 1 key in metadata
Copying datasets
# Fastest option - copies everything including version history
ds = deeplake.deepcopy('src_dataset_path', 'dest_dataset_path', src_creds, dest_creds, src_token, dest_token)
# Slower option - copies only data on the last commit
ds = deeplake.copy('src_dataset_path', 'dest_dataset_path', src_creds, dest_creds, src_token, dest_token)
Advanced
# Load a Deep Lake Dataset if it already exists (same as deeplake.load), or initialize 
# a new Deep Lake Dataset if it does not already exist (same as deeplake.empty)
ds = deeplake.dataset('dataset_path', creds = {'optional'}, token = 'optional')
# Append multiple samples using a list
ds.my_tensor.extend([np.ones((1,4)), np.ones((3,4)), np.ones((2,4))])
# Fetch adjacent data in the chunk -> Increases speed when loading 
# sequentially or if a tensor's data fits in the cache.
numeric_label = ds.labels[i].numpy(fetch_chunks = True)
Last updated
Was this helpful?