Data Operations
Loading, saving, compressing, and managing data in AVA Database.
Loading Data
AVA Database supports multiple methods for loading data from various file formats.
Loading CSV Files
Using SQL LOAD Command
-- Load CSV with automatic schema detection
LOAD TABLE sales_data FROM 'data/sales.csv';
-- Load CSV with custom delimiter
LOAD TABLE products FROM 'data/products.csv'
DELIMITER '|'
HEADER TRUE;
-- Load CSV with explicit schema
LOAD TABLE customers FROM 'data/customers.csv'
SCHEMA (
customer_id INT,
name VARCHAR(100),
email VARCHAR(100),
created_at DATETIME
);
-- Load multiple CSV files (pattern matching)
LOAD TABLE logs FROM 'data/logs_*.csv'
PARALLEL 4;
Using Python API
import avapy
# Load CSV file into table
avapy.LoadTable("data/sales.csv", "sales_data")
# Load CSV with options
avapy.LoadTable(
"data/products.csv",
"products",
delimiter="|",
header=True,
compression="gzip"
)
# Load from pandas DataFrame
import pandas as pd
df = pd.read_csv("data/customers.csv")
avapy.LoadFromDataFrame(df, "customers")
Loading Binary Files
AVA's native binary format provides faster loading and automatic compression.
Using SQL LOAD Command
-- Load AVA binary format
LOAD TABLE sales_data FROM 'data/sales.ava';
-- Load compressed binary
LOAD TABLE large_dataset FROM 'data/archive.ava.lzma'
FORMAT BINARY
COMPRESSED TRUE;
-- Load Parquet files
LOAD TABLE analytics FROM 'data/metrics.parquet'
FORMAT PARQUET;
Using Python API
import avapy
# Load AVA binary format
avapy.LoadBinary("data/sales.ava", "sales_data")
# Load compressed binary
avapy.LoadBinary(
"data/archive.ava.lzma",
"large_dataset",
compressed=True
)
# Load Parquet file
avapy.LoadParquet("data/metrics.parquet", "analytics")
Saving Data
Export data from AVA Database to various file formats.
Saving to CSV
Using SQL SAVE Command
-- Save entire table to CSV
SAVE TABLE sales_data TO 'export/sales.csv';
-- Save with custom options
SAVE TABLE products TO 'export/products.csv'
DELIMITER '|'
HEADER TRUE
QUOTE '"';
-- Save query results to CSV
SAVE (
SELECT * FROM sales_data
WHERE date >= '2025-01-01'
) TO 'export/sales_2025.csv';
-- Save with compression
SAVE TABLE large_dataset TO 'export/data.csv.gz'
COMPRESSED TRUE;
Using Python API
import avapy
# Save table to CSV
avapy.SaveTable("sales_data", "export/sales.csv")
# Save with options
avapy.SaveTable(
"products",
"export/products.csv",
delimiter="|",
header=True,
compression="gzip"
)
# Save query results
result = avapy.asql("SELECT * FROM sales_data WHERE date >= '2025-01-01'")
avapy.SaveToCSV(result, "export/sales_2025.csv")
Saving to Binary Format
Using SQL SAVE Command
-- Save to AVA binary format (fastest)
SAVE TABLE sales_data TO 'data/sales.ava'
FORMAT BINARY;
-- Save with compression
SAVE TABLE large_dataset TO 'data/archive.ava'
FORMAT BINARY
COMPRESSED TRUE
COMPRESSION_LEVEL 6;
-- Save to Parquet format
SAVE TABLE analytics TO 'data/metrics.parquet'
FORMAT PARQUET;
Using Python API
import avapy
# Save to AVA binary format
avapy.SaveBinary("sales_data", "data/sales.ava")
# Save with compression
avapy.SaveBinary(
"large_dataset",
"data/archive.ava",
compressed=True,
compression_level=6
)
# Save to Parquet
avapy.SaveParquet("analytics", "data/metrics.parquet")
In-Memory Compression
Compress and decompress data in memory for efficient processing and storage.
Compression Steps
Enable Compression for a Table
-- Enable LZMA compression
ALTER TABLE large_table SET COMPRESSION = 'LZMA';
-- Set compression level (1-9, default 6)
ALTER TABLE large_table SET COMPRESSION_LEVEL = 9;
Compress Data in Memory
-- Manually compress table in memory
COMPRESS TABLE large_table;
-- Compress specific columns
COMPRESS TABLE logs COLUMNS (message, stack_trace);
Data is compressed in blocks (default 1MB) and stored in a columnar format in memory.
Query Compressed Data
-- Query works transparently on compressed data
SELECT customer_id, SUM(amount) as total
FROM large_table
WHERE date >= '2025-01-01'
GROUP BY customer_id;
AVA automatically decompresses only the needed blocks during query execution.
Decompress Data (Optional)
-- Manually decompress table
DECOMPRESS TABLE large_table;
-- Check compression status
SHOW TABLE STATUS LIKE 'large_table';
Using Python API
import avapy
# Enable compression for table
avapy.asql("ALTER TABLE large_table SET COMPRESSION = 'LZMA'")
avapy.asql("ALTER TABLE large_table SET COMPRESSION_LEVEL = 6")
# Compress table in memory
avapy.CompressTable("large_table")
# Compress with options
avapy.CompressTable(
"large_table",
columns=["message", "stack_trace"],
compression_level=9
)
# Check compression ratio
stats = avapy.GetTableStats("large_table")
print(f"Compression ratio: {stats['compression_ratio']}")
print(f"Memory saved: {stats['memory_saved_mb']} MB")
# Decompress table
avapy.DecompressTable("large_table")
Distributed Storage
Save and load data across multiple files for distributed processing and parallel I/O.
Saving to Distributed Files
Using SQL SAVE Command
-- Save table to multiple files (partitioned by rows)
SAVE TABLE large_dataset TO 'data/distributed/part_*.ava'
DISTRIBUTED
PARTITIONS 8;
-- Partition by date column
SAVE TABLE sales_data TO 'data/sales/year=*/month=*/'
PARTITION BY (YEAR(date), MONTH(date))
FORMAT BINARY;
-- Distribute with compression
SAVE TABLE logs TO 'data/logs/part_*.ava'
DISTRIBUTED
PARTITIONS 16
COMPRESSED TRUE
COMPRESSION_LEVEL 6;
-- Custom partition size
SAVE TABLE events TO 'data/events/chunk_*.ava'
DISTRIBUTED
PARTITION_SIZE '500MB';
Using Python API
import avapy
# Save to distributed files
avapy.SaveDistributed(
table="large_dataset",
path_pattern="data/distributed/part_*.ava",
num_partitions=8
)
# Partition by column values
avapy.SaveDistributed(
table="sales_data",
path_pattern="data/sales/year={year}/month={month}/",
partition_by=["year", "month"]
)
# With compression and custom options
avapy.SaveDistributed(
table="logs",
path_pattern="data/logs/part_*.ava",
num_partitions=16,
compressed=True,
compression_level=6,
partition_size="500MB"
)
Loading from Distributed Files
Using SQL LOAD Command
-- Load from multiple files (automatic parallel loading)
LOAD TABLE large_dataset FROM 'data/distributed/part_*.ava'
DISTRIBUTED;
-- Load partitioned data
LOAD TABLE sales_data FROM 'data/sales/year=*/month=*/'
DISTRIBUTED
FORMAT BINARY;
-- Load with parallel workers
LOAD TABLE logs FROM 'data/logs/part_*.ava'
DISTRIBUTED
PARALLEL 8;
-- Load specific partitions
LOAD TABLE sales_2025 FROM 'data/sales/year=2025/month=*/'
DISTRIBUTED;
Using Python API
import avapy
# Load from distributed files
avapy.LoadDistributed(
path_pattern="data/distributed/part_*.ava",
table="large_dataset"
)
# Load with parallel workers
avapy.LoadDistributed(
path_pattern="data/logs/part_*.ava",
table="logs",
parallel=8
)
# Load specific partitions
avapy.LoadDistributed(
path_pattern="data/sales/year=2025/month=*/",
table="sales_2025"
)
Benefits of Distributed Storage
Splitting a table across multiple files enables parallel reads and writes, lets you load only the partitions a query needs (e.g. a single year or month), and keeps very large tables manageable as a set of fixed-size chunks.
Best Practices
Use Binary Format for Large Datasets
AVA's native binary format (.ava) is 3-5x faster than CSV for both loading and saving.
Enable Compression for Archival Data
Historical data that's read infrequently benefits most from compression (levels 7-9).
Use Distributed Storage for Very Large Tables
Split tables larger than 10GB into 500MB-1GB partitions for optimal parallel I/O.
Partition by Query Patterns
Partition by columns commonly used in WHERE clauses (e.g., date, region) for faster queries.