Python Tutorial Part III: Clean OptionMetrics

April 7, 2017 | Michael Gong

1. Clean OptionMetrics dataset

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline
raw = pd.read_csv('5Firms.csv.zip',parse_dates=[2,3])
raw.drop(columns=['secid','issuer','exercise_style','last_date'],inplace=True)
raw.head(5)
date exdate cp_flag strike_price best_bid best_offer volume open_interest imvol delta optionid ticker index_flag midquo
14 2012-01-03 2012-01-21 C 34.000 9.300 10.250 0 0 0.529 0.988 70433772 COF 0 9.775
15 2012-01-03 2012-01-21 C 35.000 8.600 9.200 0 1237 0.666 0.948 45658102 COF 0 8.900
16 2012-01-03 2012-01-21 C 36.000 7.350 8.300 0 10 0.515 0.965 67966062 COF 0 7.825
17 2012-01-03 2012-01-21 C 37.000 6.800 7.050 0 4 0.550 0.929 70433773 COF 0 6.925
18 2012-01-03 2012-01-21 C 38.000 5.850 6.000 0 589 0.481 0.921 54279804 COF 0 5.925
raw.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4467804 entries, 14 to 5719675
Data columns (total 14 columns):
date             datetime64[ns]
exdate           datetime64[ns]
cp_flag          object
strike_price     float64
best_bid         float64
best_offer       float64
volume           int64
open_interest    int64
imvol            float64
delta            float64
optionid         int64
ticker           object
index_flag       int64
midquo           float64
dtypes: datetime64[ns](2), float64(6), int64(4), object(2)
memory usage: 511.3+ MB
# change the implied volatility name (it's too long)
raw.rename(columns={'impl_volatility': 'imvol'}, inplace=True)
# OptionMetrics reports strike_price as the strike multiplied by 1,000, so rescale it;
# then compute the mid quote from the best bid and offer
raw['strike_price'] = raw['strike_price'] / 1000.0
raw['midquo'] = (raw['best_bid'] + raw['best_offer']) / 2
raw.head(5)
date exdate cp_flag strike_price best_bid best_offer volume open_interest imvol delta optionid ticker index_flag midquo
14 2012-01-03 2012-01-21 C 0.034 9.300 10.250 0 0 0.529 0.988 70433772 COF 0 9.775
15 2012-01-03 2012-01-21 C 0.035 8.600 9.200 0 1237 0.666 0.948 45658102 COF 0 8.900
16 2012-01-03 2012-01-21 C 0.036 7.350 8.300 0 10 0.515 0.965 67966062 COF 0 7.825
17 2012-01-03 2012-01-21 C 0.037 6.800 7.050 0 4 0.550 0.929 70433773 COF 0 6.925
18 2012-01-03 2012-01-21 C 0.038 5.850 6.000 0 589 0.481 0.921 54279804 COF 0 5.925
# drop contracts with NaN implied volatility or NaN delta
raw.dropna(subset=['imvol', 'delta'], how='any', inplace=True)
# Exclude implied volatility above 150%
raw = raw[raw['imvol'] < 1.5]
# Exclude contracts whose best offer is at or below $0.05
raw = raw[raw['best_offer'] > 0.05]
raw.head(5)
date exdate cp_flag strike_price best_bid best_offer volume open_interest imvol delta optionid ticker index_flag midquo
14 2012-01-03 2012-01-21 C 0.034 9.300 10.250 0 0 0.529 0.988 70433772 COF 0 9.775
15 2012-01-03 2012-01-21 C 0.035 8.600 9.200 0 1237 0.666 0.948 45658102 COF 0 8.900
16 2012-01-03 2012-01-21 C 0.036 7.350 8.300 0 10 0.515 0.965 67966062 COF 0 7.825
17 2012-01-03 2012-01-21 C 0.037 6.800 7.050 0 4 0.550 0.929 70433773 COF 0 6.925
18 2012-01-03 2012-01-21 C 0.038 5.850 6.000 0 589 0.481 0.921 54279804 COF 0 5.925
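As an aside, the two filters above can be expressed in a single pass with DataFrame.query; a minimal equivalent sketch:

# same two filters in one statement (equivalent to the boolean masks above)
raw = raw.query('imvol < 1.5 and best_offer > 0.05')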
# calculate maturity in calendar days
raw['maturity'] = (raw['exdate'] - raw['date']).dt.days
# Exclude contracts with maturity longer than 360 days or shorter than 7 days
raw = raw[(raw['maturity'] <= 360) & (raw['maturity'] >= 7)]
raw.head(5)
date exdate cp_flag strike_price best_bid best_offer volume open_interest imvol delta optionid ticker index_flag midquo maturity
14 2012-01-03 2012-01-21 C 0.034 9.300 10.250 0 0 0.529 0.988 70433772 COF 0 9.775 18
15 2012-01-03 2012-01-21 C 0.035 8.600 9.200 0 1237 0.666 0.948 45658102 COF 0 8.900 18
16 2012-01-03 2012-01-21 C 0.036 7.350 8.300 0 10 0.515 0.965 67966062 COF 0 7.825 18
17 2012-01-03 2012-01-21 C 0.037 6.800 7.050 0 4 0.550 0.929 70433773 COF 0 6.925 18
18 2012-01-03 2012-01-21 C 0.038 5.850 6.000 0 589 0.481 0.921 54279804 COF 0 5.925 18
# save the cleaned data: the processing above is time-consuming, so next time we can simply load it
hdf = pd.HDFStore('hdf.h5')
hdf['raw'] = raw
hdf.close()
# to resume from here next time:
# hdf = pd.HDFStore('hdf.h5')
# raw = hdf['raw']
# hdf.close()
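As a small aside, pd.HDFStore also works as a context manager, which guarantees the file is closed even if a write fails:

# equivalent save and load with context managers (the file closes automatically)
with pd.HDFStore('hdf.h5') as hdf:
    hdf['raw'] = raw
with pd.HDFStore('hdf.h5') as hdf:
    raw = hdf['raw']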

2. Some summary statistics

# select the variables we need
clean = raw[['date', 'ticker', 'imvol', 'delta', 'maturity', 'cp_flag']].reset_index(drop=True)
clean.head(5)
date ticker imvol delta maturity cp_flag
0 2012-01-03 COF 0.529 0.988 18 C
1 2012-01-03 COF 0.666 0.948 18 C
2 2012-01-03 COF 0.515 0.965 18 C
3 2012-01-03 COF 0.550 0.929 18 C
4 2012-01-03 COF 0.481 0.921 18 C
print('stock tickers:\n',clean['ticker'].unique())
print('maturity range:\n',clean['maturity'].min(),clean['maturity'].max())
print('delta range:\n',clean['delta'].min(),clean['delta'].max())
stock tickers:
 ['COF' 'DD' 'XOM' 'INTC' 'SPX' 'UNH']
maturity range:
 7 360
delta range:
 -0.999934 0.999999
stats = clean.groupby(['ticker','cp_flag'])[['imvol','delta']].describe()
stats
imvol delta
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
ticker cp_flag
COF C 100419.000 0.295 0.146 0.063 0.199 0.249 0.341 1.000 100419.000 0.523 0.359 0.008 0.130 0.577 0.884 1.000
P 108812.000 0.319 0.144 0.057 0.213 0.280 0.386 1.000 108812.000 -0.372 0.340 -1.000 -0.710 -0.240 -0.059 -0.003
DD C 125769.000 0.264 0.138 0.063 0.178 0.217 0.299 0.999 125769.000 0.553 0.366 0.008 0.146 0.639 0.917 1.000
P 134941.000 0.262 0.105 0.047 0.187 0.233 0.313 0.999 134941.000 -0.392 0.352 -1.000 -0.760 -0.255 -0.063 -0.004
INTC C 113863.000 0.306 0.150 0.084 0.218 0.248 0.331 1.000 113863.000 0.577 0.349 0.015 0.209 0.660 0.922 1.000
P 127602.000 0.293 0.113 0.074 0.221 0.260 0.334 1.000 127602.000 -0.505 0.359 -1.000 -0.882 -0.514 -0.122 -0.007
SPX C 1240711.000 0.245 0.151 0.050 0.139 0.202 0.297 1.000 1240711.000 0.674 0.361 0.001 0.373 0.869 0.970 1.000
P 1344534.000 0.259 0.135 0.033 0.157 0.233 0.326 1.000 1344534.000 -0.251 0.328 -1.000 -0.419 -0.066 -0.010 -0.000
UNH C 86588.000 0.280 0.133 0.076 0.204 0.236 0.303 1.000 86588.000 0.536 0.364 0.007 0.141 0.600 0.904 1.000
P 92393.000 0.290 0.118 0.066 0.213 0.255 0.333 0.999 92393.000 -0.395 0.352 -1.000 -0.759 -0.271 -0.058 -0.003
XOM C 112332.000 0.240 0.140 0.034 0.156 0.187 0.265 1.000 112332.000 0.535 0.358 0.006 0.148 0.601 0.889 1.000
P 126483.000 0.244 0.110 0.039 0.169 0.213 0.287 0.999 126483.000 -0.418 0.353 -1.000 -0.796 -0.316 -0.073 -0.003
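If the full describe output is too wide for reporting, the top level of the column MultiIndex can be sliced directly; a minimal sketch keeping a few implied-volatility statistics:

# compact subset of the implied-volatility summary statistics
iv_stats = stats['imvol'][['mean','std','min','max']].round(3)
print(iv_stats)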

3. Transform regression variables

# constant
clean['const'] = 1
# scaled moneyness: put deltas lie in [-1, 0], so adding 1 maps them onto the
# same [0, 1] scale as call deltas; subtracting 0.5 then centers both at zero
clean['delta_n'] = clean['delta']
clean.loc[(clean['cp_flag']=='P'),'delta_n'] = 1 + clean.loc[(clean['cp_flag']=='P'),'delta']
clean['delta_n'] = clean['delta_n'] - 0.5
# scaled maturity: divide by the 360-day cap and center at zero
clean['matur_n'] = clean['maturity']/360 - 0.5
# quadratic term of moneyness
clean['delta2'] = clean['delta_n']**2
# interaction between maturity and moneyness
clean['inter'] = clean['delta_n']*clean['matur_n']
clean.head(5)
date ticker imvol delta maturity cp_flag const delta_n matur_n delta2 inter
0 2012-01-03 COF 0.529 0.988 18 C 1 0.488 -0.450 0.239 -0.220
1 2012-01-03 COF 0.666 0.948 18 C 1 0.448 -0.450 0.201 -0.202
2 2012-01-03 COF 0.515 0.965 18 C 1 0.465 -0.450 0.216 -0.209
3 2012-01-03 COF 0.550 0.929 18 C 1 0.429 -0.450 0.184 -0.193
4 2012-01-03 COF 0.481 0.921 18 C 1 0.421 -0.450 0.177 -0.189
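Together these terms define the quadratic implied-volatility surface that is fitted ticker by ticker and day by day in the next section (writing $\Delta_n$ for delta_n and $\tau_n$ for matur_n):

$$\text{imvol}_i = \beta_0 + \beta_1 \Delta_{n,i} + \beta_2 \tau_{n,i} + \beta_3 \Delta_{n,i}^2 + \beta_4 \Delta_{n,i}\,\tau_{n,i} + \varepsilon_i$$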

4. Cross-section regression

import statsmodels.api as sm

4.1 Straightforward way

tickers = clean['ticker'].unique()
ticker = tickers[0]

tmp = clean[clean['ticker']==ticker]
dates = tmp['date'].unique()
tmp = tmp[tmp['date']==dates[0]]
tmp.head(5)
date ticker imvol delta maturity cp_flag const delta_n matur_n delta2 inter
0 2012-01-03 COF 0.529 0.988 18 C 1 0.488 -0.450 0.239 -0.220
1 2012-01-03 COF 0.666 0.948 18 C 1 0.448 -0.450 0.201 -0.202
2 2012-01-03 COF 0.515 0.965 18 C 1 0.465 -0.450 0.216 -0.209
3 2012-01-03 COF 0.550 0.929 18 C 1 0.429 -0.450 0.184 -0.193
4 2012-01-03 COF 0.481 0.921 18 C 1 0.421 -0.450 0.177 -0.189
exogs = ['const','delta_n','matur_n','delta2','inter']
mB = np.linalg.lstsq(tmp[exogs], tmp['imvol'], rcond=None)[0]
mB
array([     0.272,      0.176,     -0.170,      0.920,      0.054])
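Since statsmodels is already imported, the coefficients are easy to cross-check (a sanity check, not part of the pipeline):

# the OLS point estimates should match the lstsq solution above
print(sm.OLS(tmp['imvol'], tmp[exogs]).fit().params)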

4.2 More concise way

def CrossReg(df,exogs):
    mod = sm.OLS(df['imvol'],df[exogs]).fit()
    ind = pd.MultiIndex.from_product([['params','tvalues'],exogs])
    ret = pd.Series(index=ind, dtype=float)
    ret.loc['params'] = mod.params.values
    ret.loc['tvalues'] = mod.tvalues.values
    ret.loc[('r2','')] = mod.rsquared  # use a full tuple key so the MultiIndex stays consistent
    return ret

exogs = ['const','delta_n','matur_n','delta2','inter']
mB = clean.groupby(['ticker','date']).apply(lambda x:CrossReg(x,exogs))
mB.loc['SPX']['params'].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f2d109103c8>

[figure: daily SPX regression coefficients, one line per parameter]
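The fit statistics stored by CrossReg can be pulled out the same way; for instance, to compare average fit quality across tickers (a quick inspection, assuming the ('r2','') key used above):

# average cross-sectional R-squared per ticker
print(mB[('r2','')].groupby(level='ticker').mean())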

4.3 Need for speed

4.3.1 Just for coefficients, sequential

def CrossRegSpeed(df,exogs):
    # plain least squares: skip the statsmodels overhead when we only need betas
    mB = np.linalg.lstsq(df[exogs], df['imvol'], rcond=None)[0]
    return pd.Series(mB,index=exogs)
mB = clean.groupby(['ticker','date']).apply(lambda x:CrossRegSpeed(x,exogs))

4.3.2 Just for coefficients, parallel

import multiprocessing as mp
from functools import partial

def CrossRegParHelper(ticker):
    # DATA and EXOGS are module-level globals shared with the workers via
    # fork, so this pattern needs a 'fork' start method (the default on Linux)
    tmp = DATA[DATA['ticker']==ticker]
    mB = tmp.groupby(['date']).apply(lambda x:CrossRegSpeed(x,EXOGS))
    mB['ticker'] = ticker
    return mB

def CrossRegPar(data,exogs):
    global DATA, EXOGS
    DATA = data
    EXOGS = exogs
    tickers = data['ticker'].unique()  # derive tickers here instead of relying on the notebook-level variable
    pool = mp.Pool(processes=12)
    results = pool.map(CrossRegParHelper, tickers)
    pool.close()
    pool.join()
    mB = pd.concat(results)
    return mB

mB = CrossRegPar(clean,exogs)
%timeit clean.groupby(['ticker','date']).apply(lambda x:CrossRegSpeed(x,exogs))
8 s ± 33.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit CrossRegPar(clean,exogs)
4.2 s ± 51.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
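The unused partial import hints at an alternative that avoids module-level globals: bind the data as explicit arguments. A minimal sketch (a hypothetical variant, not the version timed above); note that each task must then pickle its arguments, which is exactly the overhead the global-plus-fork trick avoids:

def CrossRegParHelperExplicit(ticker, data, exogs):
    # same work as CrossRegParHelper, but with explicit arguments
    tmp = data[data['ticker']==ticker]
    mB = tmp.groupby(['date']).apply(lambda x:CrossRegSpeed(x,exogs))
    mB['ticker'] = ticker
    return mB

# results = pool.map(partial(CrossRegParHelperExplicit, data=clean, exogs=exogs), tickers)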

4.3.3 JIT compilation

import numba as nb
clean.sort_values(by=['ticker','date'],inplace=True)
locs = clean.groupby(['ticker','date']).size().values.cumsum()
locs = np.insert(locs,0,0)
mY = clean[['imvol']].values
mX = clean[exogs].values
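The sort above makes each (ticker, date) group a contiguous block of rows, and locs turns the group sizes into row boundaries, so group i occupies rows locs[i]:locs[i+1]. A toy illustration:

# group sizes [2, 3] become boundaries [0, 2, 5]
print(np.insert(np.array([2, 3]).cumsum(), 0, 0))  # [0 2 5]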
@nb.jit
def CrossRegJIT(mX,mY,locs):
    N = locs.shape[0]
    mB = np.zeros((mX.shape[1],N-1))
    for i in range(0,N-1):
        # solve the least-squares problem for the i-th (ticker, date) block
        b = np.linalg.lstsq(mX[locs[i]:locs[i+1]],mY[locs[i]:locs[i+1]])[0]
        mB[:,i] = b[:,0]
    return mB

mB = CrossRegJIT(mX,mY,locs)
%timeit CrossRegJIT(mX,mY,locs)
702 ms ± 6.65 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
mB = pd.DataFrame(mB.T,index=clean.groupby(['ticker','date']).size().index,columns=exogs)
mB.head(5)
const delta_n matur_n delta2 inter
ticker date
COF 2012-01-03 0.272 0.176 -0.170 0.920 0.054
COF 2012-01-04 0.288 0.079 -0.090 1.023 -0.263
COF 2012-01-05 0.288 0.160 -0.129 0.776 -0.110
COF 2012-01-06 0.251 0.246 -0.149 1.189 0.076
COF 2012-01-09 0.267 0.176 -0.167 0.881 0.312

5. Principal Component Analysis

mB.reset_index(inplace=True)
level = pd.pivot_table(mB,index='date',columns='ticker',values='const')
level.plot()

[figure: time series of the estimated level coefficient (const) for each ticker]
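One caveat before the PCA: the pivot leaves a NaN on any date where a ticker has no coefficient estimate, and the eigen-decomposition below assumes a complete matrix. A minimal guard (a no-op if the panel is already balanced):

# drop dates with missing coefficients so the covariance matrix is well-defined
level = level.dropna()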

from sklearn import preprocessing
import scipy as sp
import scipy.linalg  # 'import scipy' alone does not expose sp.linalg
def pcaAnalysis(mX, iPC, useCov=1):
    if useCov:
        # mCov = np.cov(mX, rowvar = 0)
        de_meanX = mX - np.tile(np.mean(mX, axis=0), [mX.shape[0], 1])
        mCov = np.dot(de_meanX.transpose(), de_meanX) / mX.shape[0]

        eigval, eigvec = sp.linalg.eigh(mCov)
        eigidx = np.argsort(eigval)
        maxidx = eigidx[::-1]
        sorted_vec = eigvec[:, maxidx]

        mPC = np.dot(de_meanX, sorted_vec)
        return mPC[:, 0:iPC], (eigval[maxidx] / np.sum(eigval))[0:iPC]

    else:
        mCoef = np.corrcoef(mX, rowvar=0)
        scaled_X = preprocessing.scale(mX)
        eigval, eigvec = sp.linalg.eigh(mCoef)

        eigidx = np.argsort(eigval)
        maxidx = eigidx[::-1]
        sorted_vec = eigvec[:, maxidx]

        mPC = np.dot(scaled_X, sorted_vec)
        return preprocessing.scale(mPC[:, 0:iPC]), (eigval[maxidx] /
                                                    np.sum(eigval))[0:iPC]
mPC,pct = pcaAnalysis(level.values[:500],3)
pct
array([  7.22e-01,   9.53e-02,   6.82e-02])
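As a cross-check, scikit-learn's PCA (sklearn is already a dependency via preprocessing) should report nearly identical variance ratios; it normalizes by N-1 rather than N, and component signs may flip:

from sklearn.decomposition import PCA
pca = PCA(n_components=3)
mPC_sk = pca.fit_transform(level.values[:500])
print(pca.explained_variance_ratio_)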
plt.plot(mPC)
[<matplotlib.lines.Line2D at 0x7f054028cb70>,
 <matplotlib.lines.Line2D at 0x7f054028c518>,
 <matplotlib.lines.Line2D at 0x7f054028cf28>]

[figure: the first three principal components of the level series]