Aggregate Functions
Statistical and aggregate functions for data analysis across groups of rows.
Example Dataset
Using prices table: TICKER, DATE, PRICE
Basic Aggregate Functions
COUNT - Count Rows
-- Count all rows
SELECT COUNT(*) as total_rows
FROM prices;
-- Count non-NULL values
SELECT COUNT(PRICE) as price_count
FROM prices;
-- Count distinct values
SELECT COUNT(DISTINCT TICKER) as num_tickers
FROM prices;
-- Count by group
SELECT
TICKER,
COUNT(*) as num_records
FROM prices
GROUP BY TICKER;
SUM - Total Sum
-- Sum of all prices
SELECT SUM(PRICE) as total_price
FROM prices;
-- Sum by ticker
SELECT
TICKER,
SUM(PRICE) as total_price
FROM prices
GROUP BY TICKER;
-- Conditional sum
SELECT SUM(CASE WHEN PRICE > 100 THEN 1 ELSE 0 END) as high_price_count
FROM prices;
AVG - Average
-- Average price across all data
SELECT AVG(PRICE) as avg_price
FROM prices;
-- Average by ticker
SELECT
TICKER,
AVG(PRICE) as avg_price,
COUNT(*) as num_records
FROM prices
GROUP BY TICKER;
-- Average for specific period
SELECT AVG(PRICE) as avg_price_2025
FROM prices
WHERE DATE >= '2025-01-01';
MIN and MAX - Minimum and Maximum
-- Overall min and max
SELECT
MIN(PRICE) as min_price,
MAX(PRICE) as max_price
FROM prices;
-- Min and max by ticker
SELECT
TICKER,
MIN(PRICE) as min_price,
MAX(PRICE) as max_price,
MAX(PRICE) - MIN(PRICE) as price_range
FROM prices
GROUP BY TICKER;
-- Min and max dates
SELECT
TICKER,
MIN(DATE) as start_date,
MAX(DATE) as end_date
FROM prices
GROUP BY TICKER;
Statistical Functions
STD - Standard Deviation
-- Standard deviation of prices
SELECT
TICKER,
AVG(PRICE) as avg_price,
STD(PRICE) as std_price,
STD(PRICE) / AVG(PRICE) as coef_variation
FROM prices
GROUP BY TICKER;
-- Volatility calculation (std of returns)
UPDATE daily_return: PRICE / LAG(PRICE, 1) - 1
FROM prices
GROUP BY TICKER;
SELECT
TICKER,
STD(daily_return) as daily_volatility,
STD(daily_return) * SQRT(252) as annualized_volatility
FROM prices
GROUP BY TICKER;
CORR - Correlation
-- Correlation between two series
-- First, create a table with two tickers
CREATE TABLE ticker_pair AS (
SELECT
a.DATE,
a.PRICE as price_a,
b.PRICE as price_b
FROM (SELECT * FROM prices WHERE TICKER = 'AAPL') a
INNER JOIN (SELECT * FROM prices WHERE TICKER = 'MSFT') b
ON a.DATE = b.DATE
);
-- Calculate correlation
SELECT CORR(price_a, price_b) as correlation
FROM ticker_pair;
-- Returns correlation
UPDATE
return_a: price_a / LAG(price_a, 1) - 1,
return_b: price_b / LAG(price_b, 1) - 1
FROM ticker_pair;
SELECT CORR(return_a, return_b) as return_correlation
FROM ticker_pair;
COVAR - Covariance
-- Covariance between two series
-- Using the same ticker_pair table from CORR example above
SELECT COVAR(price_a, price_b) as price_covariance
FROM ticker_pair;
-- Calculate covariance of returns
UPDATE
return_a: price_a / LAG(price_a, 1) - 1,
return_b: price_b / LAG(price_b, 1) - 1
FROM ticker_pair;
SELECT COVAR(return_a, return_b) as return_covariance
FROM ticker_pair;
-- Covariance is related to correlation by:
-- COVAR(X,Y) = CORR(X,Y) * STD(X) * STD(Y)
SELECT
COVAR(return_a, return_b) as covariance,
CORR(return_a, return_b) as correlation,
STD(return_a) as std_a,
STD(return_b) as std_b
FROM ticker_pair;
MEDIAN - Median Value
-- Calculate median price for each ticker
SELECT
TICKER,
AVG(PRICE) as mean_price,
MEDIAN(PRICE) as median_price,
MIN(PRICE) as min_price,
MAX(PRICE) as max_price
FROM prices
GROUP BY TICKER;
-- Compare mean vs median to detect skewness
SELECT
TICKER,
AVG(RETURN) as mean_return,
MEDIAN(RETURN) as median_return,
AVG(RETURN) - MEDIAN(RETURN) as skew_indicator
FROM returns
GROUP BY TICKER;
-- Median absolute deviation (MAD) for robust volatility
UPDATE abs_dev: ABS(PRICE - MEDIAN(PRICE))
FROM prices
GROUP BY TICKER;
SELECT
TICKER,
MEDIAN(abs_dev) as median_abs_deviation,
MEDIAN(abs_dev) * 1.4826 as robust_std
FROM prices
GROUP BY TICKER;
Positional Aggregates
FIRST - First Value in Group
-- Get first price for each ticker
SELECT
TICKER,
FIRST(PRICE) as first_price,
FIRST(DATE) as first_date
FROM prices
GROUP BY TICKER;
-- First price after specific date
SELECT
TICKER,
FIRST(PRICE) as first_price_2025
FROM prices
WHERE DATE >= '2025-01-01'
GROUP BY TICKER;
LAST - Last Value in Group
-- Get last price for each ticker
SELECT
TICKER,
LAST(PRICE) as last_price,
LAST(DATE) as last_date
FROM prices
GROUP BY TICKER;
-- Period return using FIRST and LAST
SELECT
TICKER,
FIRST(PRICE) as start_price,
LAST(PRICE) as end_price,
LAST(PRICE) / FIRST(PRICE) - 1 as period_return
FROM prices
GROUP BY TICKER;
Combining FIRST and LAST
-- Comprehensive period analysis
SELECT
TICKER,
COUNT(*) as num_records,
FIRST(DATE) as start_date,
LAST(DATE) as end_date,
FIRST(PRICE) as start_price,
LAST(PRICE) as end_price,
MIN(PRICE) as min_price,
MAX(PRICE) as max_price,
AVG(PRICE) as avg_price,
STD(PRICE) as std_price,
LAST(PRICE) / FIRST(PRICE) - 1 as total_return
FROM prices
GROUP BY TICKER;
Multiple Aggregations
Comprehensive Statistics
-- Complete statistical summary by ticker
SELECT
TICKER,
-- Count
COUNT(*) as num_records,
COUNT(DISTINCT DATE) as num_trading_days,
-- Central tendency
AVG(PRICE) as mean_price,
-- Range
MIN(PRICE) as min_price,
MAX(PRICE) as max_price,
MAX(PRICE) - MIN(PRICE) as range_price,
-- Spread
STD(PRICE) as std_price,
STD(PRICE) / AVG(PRICE) as cv_price,
-- Position
FIRST(PRICE) as first_price,
LAST(PRICE) as last_price,
-- Performance
LAST(PRICE) / FIRST(PRICE) - 1 as total_return
FROM prices
GROUP BY TICKER;
Time-Based Aggregations
-- Daily statistics across all tickers
SELECT
DATE,
COUNT(DISTINCT TICKER) as num_tickers,
AVG(PRICE) as avg_price,
MIN(PRICE) as min_price,
MAX(PRICE) as max_price,
STD(PRICE) as cross_sectional_std
FROM prices
GROUP BY DATE
ORDER BY DATE;
Aggregates with HAVING Clause
Filter groups based on aggregate results using HAVING.
-- Tickers with sufficient data
SELECT
TICKER,
COUNT(*) as num_records,
AVG(PRICE) as avg_price
FROM prices
GROUP BY TICKER
HAVING COUNT(*) >= 100;
-- High volatility tickers
SELECT
TICKER,
STD(PRICE) as std_price,
AVG(PRICE) as avg_price,
STD(PRICE) / AVG(PRICE) as volatility_ratio
FROM prices
GROUP BY TICKER
HAVING STD(PRICE) / AVG(PRICE) > 0.2;
-- Tickers with large price movements
SELECT
TICKER,
FIRST(PRICE) as start_price,
LAST(PRICE) as end_price,
LAST(PRICE) / FIRST(PRICE) - 1 as total_return
FROM prices
GROUP BY TICKER
HAVING ABS(LAST(PRICE) / FIRST(PRICE) - 1) > 0.1;
Practical Examples
Example 1: Portfolio Analysis
-- Load price data
.load '/data/bt/20251208W/POLARIS1/data/prices.csv' as prices
-- Calculate returns
UPDATE daily_return: PRICE / LAG(PRICE, 1) - 1
FROM prices
GROUP BY TICKER;
-- Portfolio statistics
SELECT
TICKER,
COUNT(*) as num_days,
-- Price statistics
FIRST(PRICE) as start_price,
LAST(PRICE) as end_price,
AVG(PRICE) as avg_price,
MIN(PRICE) as min_price,
MAX(PRICE) as max_price,
-- Return statistics
AVG(daily_return) as avg_daily_return,
STD(daily_return) as daily_volatility,
AVG(daily_return) * 252 as annualized_return,
STD(daily_return) * SQRT(252) as annualized_volatility,
-- Sharpe ratio (assuming 0% risk-free rate)
(AVG(daily_return) * 252) / (STD(daily_return) * SQRT(252)) as sharpe_ratio,
-- Total return
LAST(PRICE) / FIRST(PRICE) - 1 as total_return
FROM prices
GROUP BY TICKER
HAVING COUNT(*) >= 50 -- At least 50 trading days
ORDER BY sharpe_ratio DESC;
Example 2: Cross-Sectional Analysis
-- Daily cross-sectional statistics
SELECT
DATE,
COUNT(DISTINCT TICKER) as num_stocks,
AVG(PRICE) as avg_price,
STD(PRICE) as price_dispersion,
MIN(PRICE) as min_price,
MAX(PRICE) as max_price
FROM prices
GROUP BY DATE
ORDER BY DATE;
Example 3: Data Quality Check
-- Check data completeness
SELECT
TICKER,
COUNT(*) as num_records,
COUNT(DISTINCT DATE) as num_unique_dates,
COUNT(PRICE) as num_non_null_prices,
COUNT(*) - COUNT(PRICE) as num_null_prices,
MIN(DATE) as first_date,
MAX(DATE) as last_date,
MIN(PRICE) as min_price,
MAX(PRICE) as max_price
FROM prices
GROUP BY TICKER
HAVING COUNT(*) - COUNT(PRICE) > 0 -- Has NULL values
OR COUNT(*) < 100 -- Insufficient data
ORDER BY num_records;
Example 4: Python API with Aggregates
import avapy
# Load data
avapy.asql(".load '/data/prices.csv' as prices")
# Calculate returns
avapy.asql("""
UPDATE daily_return: PRICE / LAG(PRICE, 1) - 1
FROM prices
GROUP BY TICKER
""")
# Get summary statistics
summary = avapy.asql("""
SELECT
TICKER,
COUNT(*) as num_records,
AVG(PRICE) as avg_price,
STD(PRICE) as std_price,
AVG(daily_return) as avg_return,
STD(daily_return) as volatility,
LAST(PRICE) / FIRST(PRICE) - 1 as total_return
FROM prices
GROUP BY TICKER
HAVING COUNT(*) >= 50
ORDER BY total_return DESC
""")
print(summary)