Aggregate Functions

Statistical and aggregate functions for data analysis across groups of rows.

Example Dataset

Using prices table: TICKER, DATE, PRICE

Basic Aggregate Functions

COUNT - Count Rows

-- Count all rows
SELECT COUNT(*) as total_rows
FROM prices;

-- Count non-NULL values
SELECT COUNT(PRICE) as price_count
FROM prices;

-- Count distinct values
SELECT COUNT(DISTINCT TICKER) as num_tickers
FROM prices;

-- Count by group
SELECT
    TICKER,
    COUNT(*) as num_records
FROM prices
GROUP BY TICKER;

SUM - Total Sum

-- Sum of all prices
SELECT SUM(PRICE) as total_price
FROM prices;

-- Sum by ticker
SELECT
    TICKER,
    SUM(PRICE) as total_price
FROM prices
GROUP BY TICKER;

-- Conditional sum
SELECT SUM(CASE WHEN PRICE > 100 THEN 1 ELSE 0 END) as high_price_count
FROM prices;

AVG - Average

-- Average price across all data
SELECT AVG(PRICE) as avg_price
FROM prices;

-- Average by ticker
SELECT
    TICKER,
    AVG(PRICE) as avg_price,
    COUNT(*) as num_records
FROM prices
GROUP BY TICKER;

-- Average for specific period
SELECT AVG(PRICE) as avg_price_2025
FROM prices
WHERE DATE >= '2025-01-01';

MIN and MAX - Minimum and Maximum

-- Overall min and max
SELECT
    MIN(PRICE) as min_price,
    MAX(PRICE) as max_price
FROM prices;

-- Min and max by ticker
SELECT
    TICKER,
    MIN(PRICE) as min_price,
    MAX(PRICE) as max_price,
    MAX(PRICE) - MIN(PRICE) as price_range
FROM prices
GROUP BY TICKER;

-- Min and max dates
SELECT
    TICKER,
    MIN(DATE) as start_date,
    MAX(DATE) as end_date
FROM prices
GROUP BY TICKER;

Statistical Functions

STD - Standard Deviation

-- Standard deviation of prices
SELECT
    TICKER,
    AVG(PRICE) as avg_price,
    STD(PRICE) as std_price,
    STD(PRICE) / AVG(PRICE) as coef_variation
FROM prices
GROUP BY TICKER;

-- Volatility calculation (std of returns)
UPDATE daily_return: PRICE / LAG(PRICE, 1) - 1
FROM prices
GROUP BY TICKER;

SELECT
    TICKER,
    STD(daily_return) as daily_volatility,
    STD(daily_return) * SQRT(252) as annualized_volatility
FROM prices
GROUP BY TICKER;

CORR - Correlation

-- Correlation between two series
-- First, create a table with two tickers
CREATE TABLE ticker_pair AS (
    SELECT
        a.DATE,
        a.PRICE as price_a,
        b.PRICE as price_b
    FROM (SELECT * FROM prices WHERE TICKER = 'AAPL') a
    INNER JOIN (SELECT * FROM prices WHERE TICKER = 'MSFT') b
        ON a.DATE = b.DATE
);

-- Calculate correlation
SELECT CORR(price_a, price_b) as correlation
FROM ticker_pair;

-- Returns correlation
UPDATE
    return_a: price_a / LAG(price_a, 1) - 1,
    return_b: price_b / LAG(price_b, 1) - 1
FROM ticker_pair;

SELECT CORR(return_a, return_b) as return_correlation
FROM ticker_pair;

COVAR - Covariance

-- Covariance between two series
-- Using the same ticker_pair table from CORR example above
SELECT COVAR(price_a, price_b) as price_covariance
FROM ticker_pair;

-- Calculate covariance of returns
UPDATE
    return_a: price_a / LAG(price_a, 1) - 1,
    return_b: price_b / LAG(price_b, 1) - 1
FROM ticker_pair;

SELECT COVAR(return_a, return_b) as return_covariance
FROM ticker_pair;

-- Covariance is related to correlation by:
-- COVAR(X,Y) = CORR(X,Y) * STD(X) * STD(Y)
SELECT
    COVAR(return_a, return_b) as covariance,
    CORR(return_a, return_b) as correlation,
    STD(return_a) as std_a,
    STD(return_b) as std_b
FROM ticker_pair;

MEDIAN - Median Value

-- Calculate median price for each ticker
SELECT
    TICKER,
    AVG(PRICE) as mean_price,
    MEDIAN(PRICE) as median_price,
    MIN(PRICE) as min_price,
    MAX(PRICE) as max_price
FROM prices
GROUP BY TICKER;

-- Compare mean vs median to detect skewness
SELECT
    TICKER,
    AVG(RETURN) as mean_return,
    MEDIAN(RETURN) as median_return,
    AVG(RETURN) - MEDIAN(RETURN) as skew_indicator
FROM returns
GROUP BY TICKER;

-- Median absolute deviation (MAD) for robust volatility
UPDATE abs_dev: ABS(PRICE - MEDIAN(PRICE))
FROM prices
GROUP BY TICKER;

SELECT
    TICKER,
    MEDIAN(abs_dev) as median_abs_deviation,
    MEDIAN(abs_dev) * 1.4826 as robust_std
FROM prices
GROUP BY TICKER;

Positional Aggregates

FIRST - First Value in Group

-- Get first price for each ticker
SELECT
    TICKER,
    FIRST(PRICE) as first_price,
    FIRST(DATE) as first_date
FROM prices
GROUP BY TICKER;

-- First price after specific date
SELECT
    TICKER,
    FIRST(PRICE) as first_price_2025
FROM prices
WHERE DATE >= '2025-01-01'
GROUP BY TICKER;

LAST - Last Value in Group

-- Get last price for each ticker
SELECT
    TICKER,
    LAST(PRICE) as last_price,
    LAST(DATE) as last_date
FROM prices
GROUP BY TICKER;

-- Period return using FIRST and LAST
SELECT
    TICKER,
    FIRST(PRICE) as start_price,
    LAST(PRICE) as end_price,
    LAST(PRICE) / FIRST(PRICE) - 1 as period_return
FROM prices
GROUP BY TICKER;

Combining FIRST and LAST

-- Comprehensive period analysis
SELECT
    TICKER,
    COUNT(*) as num_records,
    FIRST(DATE) as start_date,
    LAST(DATE) as end_date,
    FIRST(PRICE) as start_price,
    LAST(PRICE) as end_price,
    MIN(PRICE) as min_price,
    MAX(PRICE) as max_price,
    AVG(PRICE) as avg_price,
    STD(PRICE) as std_price,
    LAST(PRICE) / FIRST(PRICE) - 1 as total_return
FROM prices
GROUP BY TICKER;

Multiple Aggregations

Comprehensive Statistics

-- Complete statistical summary by ticker
SELECT
    TICKER,
    -- Count
    COUNT(*) as num_records,
    COUNT(DISTINCT DATE) as num_trading_days,

    -- Central tendency
    AVG(PRICE) as mean_price,

    -- Range
    MIN(PRICE) as min_price,
    MAX(PRICE) as max_price,
    MAX(PRICE) - MIN(PRICE) as range_price,

    -- Spread
    STD(PRICE) as std_price,
    STD(PRICE) / AVG(PRICE) as cv_price,

    -- Position
    FIRST(PRICE) as first_price,
    LAST(PRICE) as last_price,

    -- Performance
    LAST(PRICE) / FIRST(PRICE) - 1 as total_return
FROM prices
GROUP BY TICKER;

Time-Based Aggregations

-- Daily statistics across all tickers
SELECT
    DATE,
    COUNT(DISTINCT TICKER) as num_tickers,
    AVG(PRICE) as avg_price,
    MIN(PRICE) as min_price,
    MAX(PRICE) as max_price,
    STD(PRICE) as cross_sectional_std
FROM prices
GROUP BY DATE
ORDER BY DATE;

Aggregates with HAVING Clause

Filter groups based on aggregate results using HAVING.

-- Tickers with sufficient data
SELECT
    TICKER,
    COUNT(*) as num_records,
    AVG(PRICE) as avg_price
FROM prices
GROUP BY TICKER
HAVING COUNT(*) >= 100;

-- High volatility tickers
SELECT
    TICKER,
    STD(PRICE) as std_price,
    AVG(PRICE) as avg_price,
    STD(PRICE) / AVG(PRICE) as volatility_ratio
FROM prices
GROUP BY TICKER
HAVING STD(PRICE) / AVG(PRICE) > 0.2;

-- Tickers with large price movements
SELECT
    TICKER,
    FIRST(PRICE) as start_price,
    LAST(PRICE) as end_price,
    LAST(PRICE) / FIRST(PRICE) - 1 as total_return
FROM prices
GROUP BY TICKER
HAVING ABS(LAST(PRICE) / FIRST(PRICE) - 1) > 0.1;

Practical Examples

Example 1: Portfolio Analysis

-- Load price data
.load '/data/bt/20251208W/POLARIS1/data/prices.csv' as prices

-- Calculate returns
UPDATE daily_return: PRICE / LAG(PRICE, 1) - 1
FROM prices
GROUP BY TICKER;

-- Portfolio statistics
SELECT
    TICKER,
    COUNT(*) as num_days,

    -- Price statistics
    FIRST(PRICE) as start_price,
    LAST(PRICE) as end_price,
    AVG(PRICE) as avg_price,
    MIN(PRICE) as min_price,
    MAX(PRICE) as max_price,

    -- Return statistics
    AVG(daily_return) as avg_daily_return,
    STD(daily_return) as daily_volatility,
    AVG(daily_return) * 252 as annualized_return,
    STD(daily_return) * SQRT(252) as annualized_volatility,

    -- Sharpe ratio (assuming 0% risk-free rate)
    (AVG(daily_return) * 252) / (STD(daily_return) * SQRT(252)) as sharpe_ratio,

    -- Total return
    LAST(PRICE) / FIRST(PRICE) - 1 as total_return
FROM prices
GROUP BY TICKER
HAVING COUNT(*) >= 50  -- At least 50 trading days
ORDER BY sharpe_ratio DESC;

Example 2: Cross-Sectional Analysis

-- Daily cross-sectional statistics
SELECT
    DATE,
    COUNT(DISTINCT TICKER) as num_stocks,
    AVG(PRICE) as avg_price,
    STD(PRICE) as price_dispersion,
    MIN(PRICE) as min_price,
    MAX(PRICE) as max_price
FROM prices
GROUP BY DATE
ORDER BY DATE;

Example 3: Data Quality Check

-- Check data completeness
SELECT
    TICKER,
    COUNT(*) as num_records,
    COUNT(DISTINCT DATE) as num_unique_dates,
    COUNT(PRICE) as num_non_null_prices,
    COUNT(*) - COUNT(PRICE) as num_null_prices,
    MIN(DATE) as first_date,
    MAX(DATE) as last_date,
    MIN(PRICE) as min_price,
    MAX(PRICE) as max_price
FROM prices
GROUP BY TICKER
HAVING COUNT(*) - COUNT(PRICE) > 0  -- Has NULL values
    OR COUNT(*) < 100  -- Insufficient data
ORDER BY num_records;

Example 4: Python API with Aggregates

import avapy

# Load data
avapy.asql(".load '/data/prices.csv' as prices")

# Calculate returns
avapy.asql("""
    UPDATE daily_return: PRICE / LAG(PRICE, 1) - 1
    FROM prices
    GROUP BY TICKER
""")

# Get summary statistics
summary = avapy.asql("""
    SELECT
        TICKER,
        COUNT(*) as num_records,
        AVG(PRICE) as avg_price,
        STD(PRICE) as std_price,
        AVG(daily_return) as avg_return,
        STD(daily_return) as volatility,
        LAST(PRICE) / FIRST(PRICE) - 1 as total_return
    FROM prices
    GROUP BY TICKER
    HAVING COUNT(*) >= 50
    ORDER BY total_return DESC
""")

print(summary)