Speed up Postgres Query and Function



  • I have a table that looks like this:

    create_table "cboe_raws", force: :cascade do |t|
    t.date "quote_date", null: false
    t.string "underlying_symbol", null: false
    t.string "security_type", null: false
    t.string "option_symbol", null: false
    t.date "expiration_date", null: false
    t.float "strike_price", default: 0.0, null: false
    t.string "call_put_flag", null: false
    t.integer "days_to_expire", default: 0, null: false
    t.string "series_type", null: false
    t.float "first_trade_price", default: 0.0, null: false
    t.float "high_trade_price", default: 0.0, null: false
    t.float "low_trade_price", default: 0.0, null: false
    t.float "last_trade_price", default: 0.0, null: false
    t.integer "total_exchange_vol", default: 0, null: false
    t.integer "total_industry_vol", default: 0, null: false
    t.integer "open_interest", default: 0, null: false
    t.integer "previous_open_interest", default: 0, null: false
    t.float "previous_close", default: 0.0, null: false
    t.integer "firm_open_buy_qty", default: 0, null: false
    t.integer "firm_open_buy_vol", default: 0, null: false
    t.integer "firm_close_buy_qty", default: 0, null: false
    t.integer "firm_close_buy_vol", default: 0, null: false
    t.integer "firm_open_sell_qty", default: 0, null: false
    t.integer "firm_open_sell_vol", default: 0, null: false
    t.integer "firm_close_sell_qty", default: 0, null: false
    t.integer "firm_close_sell_vol", default: 0, null: false
    t.integer "bd_open_buy_qty", default: 0, null: false
    t.integer "bd_open_buy_vol", default: 0, null: false
    t.integer "bd_close_buy_qty", default: 0, null: false
    t.integer "bd_close_buy_vol", default: 0, null: false
    t.integer "bd_open_sell_qty", default: 0, null: false
    t.integer "bd_open_sell_vol", default: 0, null: false
    t.integer "bd_close_sell_qty", default: 0, null: false
    t.integer "bd_close_sell_vol", default: 0, null: false
    t.integer "mm_buy_qty", default: 0, null: false
    t.integer "mm_buy_vol", default: 0, null: false
    t.integer "mm_sell_qty", default: 0, null: false
    t.integer "mm_sell_vol", default: 0, null: false
    t.integer "cust_lt_100_open_buy_qty", default: 0, null: false
    t.integer "cust_lt_100_open_buy_vol", default: 0, null: false
    t.integer "cust_lt_100_close_buy_qty", default: 0, null: false
    t.integer "cust_lt_100_close_buy_vol", default: 0, null: false
    t.integer "cust_lt_100_open_sell_qty", default: 0, null: false
    t.integer "cust_lt_100_open_sell_vol", default: 0, null: false
    t.integer "cust_lt_100_close_sell_qty", default: 0, null: false
    t.integer "cust_lt_100_close_sell_vol", default: 0, null: false
    t.integer "cust_100_199_open_buy_qty", default: 0, null: false
    t.integer "cust_100_199_open_buy_vol", default: 0, null: false
    t.integer "cust_gt_199_clsoe_buy_qty", default: 0, null: false
    t.integer "cust_100_199_close_buy_vol", default: 0, null: false
    t.integer "cust_100_199_open_sell_qty", default: 0, null: false
    t.integer "cust_100_199_open_sell_vol", default: 0, null: false
    t.integer "cust_100_199_close_sell_qty", default: 0, null: false
    t.integer "cust_100_199_close_sell_vol", default: 0, null: false
    t.integer "cust_gt_199_open_buy_qty", default: 0, null: false
    t.integer "cust_gt_199_open_buy_vol", default: 0, null: false
    t.integer "cust_gt_199_close_buy_qty", default: 0, null: false
    t.integer "cust_gt_199_close_buy_vol", default: 0, null: false
    t.integer "cust_gt_199_open_sell_qty", default: 0, null: false
    t.integer "cust_gt_199_open_sell_vol", default: 0, null: false
    t.integer "cust_gt_199_close_sell_qty", default: 0, null: false
    t.integer "cust_gt_199_close_sell_vol", default: 0, null: false
    t.integer "procust_lt_100_open_buy_qty", default: 0, null: false
    t.integer "procust_lt_100_open_buy_vol", default: 0, null: false
    t.integer "procust_lt_100_close_buy_qty", default: 0, null: false
    t.integer "procust_lt_100_close_buy_vol", default: 0, null: false
    t.integer "procust_lt_100_open_sell_qty", default: 0, null: false
    t.integer "procust_lt_100_open_sell_vol", default: 0, null: false
    t.integer "procust_lt_100_close_sell_qty", default: 0, null: false
    t.integer "procust_lt_100_close_sell_vol", default: 0, null: false
    t.integer "procust_100_199_open_buy_qty", default: 0, null: false
    t.integer "procust_100_199_open_buy_vol", default: 0, null: false
    t.integer "procust_100_199_close_buy_qty", default: 0, null: false
    t.integer "procust_100_199_close_buy_vol", default: 0, null: false
    t.integer "procust_100_199_open_sell_qty", default: 0, null: false
    t.integer "procust_100_199_open_sell_vol", default: 0, null: false
    t.integer "procust_100_199_close_sell_qty", default: 0, null: false
    t.integer "procust_100_199_close_sell_vol", default: 0, null: false
    t.integer "procust_gt_199_open_buy_qty", default: 0, null: false
    t.integer "procust_gt_199_open_buy_vol", default: 0, null: false
    t.integer "procust_gt_199_close_buy_vol", default: 0, null: false
    t.integer "procust_gt_199_open_sell_qty", default: 0, null: false
    t.integer "procust_gt_199_open_sell_vol", default: 0, null: false
    t.integer "procust_gt_199_close_sell_qty", default: 0, null: false
    t.integer "procust_gt_199_close_sell_vol", default: 0, null: false
    t.boolean "active", default: true, null: false
    t.datetime "created_at", null: false
    t.datetime "updated_at", null: false
    t.integer "cust_100_199_close_buy_qty"
    t.integer "procust_gt_199_close_buy_qty"
    t.index ["active"], name: "index_cboe_raws_on_active"
    t.index ["call_put_flag"], name: "index_cboe_raws_on_call_put_flag"
    t.index ["expiration_date"], name: "index_cboe_raws_on_expiration_date"
    t.index ["option_symbol"], name: "index_cboe_raws_on_option_symbol"
    t.index ["quote_date"], name: "index_cboe_raws_on_quote_date"
    t.index ["strike_price"], name: "index_cboe_raws_on_strike_price"
    

    I then ran a migration to add indexes:

    class IndexTable < ActiveRecord::Migration[7.0]
      def change
        add_index :cboe_raws, :expiration_date
        add_index :cboe_raws, :call_put_flag
        add_index :cboe_raws, :quote_date
        add_index :cboe_raws, :active
        add_index :cboe_raws, :option_symbol
        add_index :cboe_raws, :strike_price
      end
    end
    

    I then created a DB function as follows:

    CREATE OR REPLACE FUNCTION market_maker_books (symbol VARCHAR)
        RETURNS TABLE (
            contract_underlying_symbol VARCHAR,
            contract_expiration_date DATE,
            contract_strike_price DOUBLE PRECISION,
            contract_call_put_flag VARCHAR,
            contract_mm_buy_vol_sum BIGINT,
            contract_mm_sell_vol_sum BIGINT,
            contract_buys_minus_sells BIGINT
        )
        AS $$
        BEGIN
            RETURN QUERY SELECT underlying_symbol,
            expiration_date,
            strike_price,
            call_put_flag,
            SUM(mm_buy_vol) AS mm_buy_vol_sum,
            SUM(mm_sell_vol) AS mm_sell_vol_sum,
            SUM(mm_buy_vol) - SUM(mm_sell_vol) AS buys_minus_sells
          FROM cboe_raws
          WHERE underlying_symbol = symbol AND expiration_date >= CURRENT_DATE
          GROUP BY underlying_symbol, expiration_date, strike_price, call_put_flag
          ORDER BY expiration_date, call_put_flag, strike_price;
        END; $$
        LANGUAGE 'plpgsql';
    

    Calling the function above takes about 18-19 seconds on a database with 8.6 million rows. I need that to be much, much faster, because this database is only going to grow.

    Does anyone have suggestions on how I can speed this up?

    Here is the explain:

    Function Scan on market_maker_books  (cost=0.05..3.05 rows=1000 width=164) (actual time=241.870..241.906 rows=491 loops=1)
      Buffers: shared hit=678 read=1251
      I/O Timings: read=194.877
    Planning Time: 0.029 ms
    Execution Time: 241.956 ms
    

  • QA Engineer

    Assuming current Postgres 14 for lack of declaration.

    Index

    A multicolumn index (https://www.postgresql.org/docs/current/indexes-multicolumn.html) on (underlying_symbol, expiration_date) would make your query fast. Any variant will do as long as underlying_symbol is the leading column: equality first, range later. See:

    • https://dba.stackexchange.com/questions/33196/multicolumn-index-and-performance/33220#33220
    • https://dba.stackexchange.com/questions/27481/is-a-composite-index-also-good-for-queries-on-the-first-field
    • https://dba.stackexchange.com/questions/90128/unused-index-in-range-of-dates-query/90183#90183

    Even a plain index on just (underlying_symbol) might go a long way - depending on undisclosed data distribution.

    Seeing that your table has very wide rows, a "covering" index might be ideal - if you get index-only scans out of it:

    CREATE INDEX foo ON cboe_raws (underlying_symbol, expiration_date DESC NULLS LAST)
    INCLUDE (strike_price, call_put_flag, mm_buy_vol, mm_sell_vol, mm_buy_vol);
    

    Function

    You typically don't need PL/pgSQL for a single, plain SQL query. Use an SQL function instead. I suggest the new standard-SQL variant:

    CREATE OR REPLACE FUNCTION market_maker_books(_symbol varchar)
      RETURNS TABLE (contract_underlying_symbol varchar
                   , contract_expiration_date date
                   , contract_strike_price float8
                   , contract_call_put_flag varchar
                   , contract_mm_buy_vol_sum bigint
                   , contract_mm_sell_vol_sum bigint
                   , contract_buys_minus_sells bigint)
      LANGUAGE sql STABLE STRICT PARALLEL SAFE
    BEGIN ATOMIC
    SELECT c.underlying_symbol
         , c.expiration_date
         , c.strike_price
         , c.call_put_flag
         , SUM(c.mm_buy_vol)  -- AS mm_buy_vol_sum
         , SUM(c.mm_sell_vol) -- AS mm_sell_vol_sum
         , SUM(c.mm_buy_vol) - SUM(c.mm_sell_vol) -- AS buys_minus_sells
    FROM   cboe_raws c
    WHERE  c.underlying_symbol = _symbol
    AND    c.expiration_date >= CURRENT_DATE
    GROUP  BY c.underlying_symbol, c.expiration_date, c.strike_price, c.call_put_flag
    ORDER  BY c.expiration_date, c.call_put_flag, c.strike_price;
    END;
    

    See:

    • https://dba.stackexchange.com/questions/300507/what-does-begin-atomic-end-mean-in-a-postgresql-sql-function-procedure/300512#300512

    Might be untangled and compacted further:

    CREATE OR REPLACE FUNCTION market_maker_books(_symbol varchar)
      RETURNS TABLE (expiration_date date
                   , call_put_flag varchar
                   , strike_price float8
                   , mm_buy_vol_sum bigint
                   , mm_sell_vol_sum bigint
                   , buys_minus_sells bigint)
      LANGUAGE sql STABLE STRICT PARALLEL SAFE
    BEGIN ATOMIC
    SELECT c.expiration_date, c.call_put_flag, c.strike_price
         , SUM(c.mm_buy_vol)
         , SUM(c.mm_sell_vol)
         , SUM(c.mm_buy_vol) - SUM(c.mm_sell_vol)
    FROM   cboe_raws c
    WHERE  c.underlying_symbol = _symbol
    AND    c.expiration_date >= CURRENT_DATE
    GROUP  BY 1, 2, 3
    ORDER  BY 1, 2, 3;
    END;
    



Suggested Topics

  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2