Python Tutorial Part III: Clean OptionMetrics

April 7, 2017 | Michael Gong

1. Clean OptionMetrics dataset

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline
raw = pd.read_csv('5Firms.csv.zip',parse_dates=[2,3])
raw.drop(columns=['secid','issuer','exercise_style','last_date'],inplace=True)
raw.head(5)
date exdate cp_flag strike_price best_bid best_offer volume open_interest imvol delta optionid ticker index_flag midquo
14 2012-01-03 2012-01-21 C 34.000 9.300 10.250 0 0 0.529 0.988 70433772 COF 0 9.775
15 2012-01-03 2012-01-21 C 35.000 8.600 9.200 0 1237 0.666 0.948 45658102 COF 0 8.900
16 2012-01-03 2012-01-21 C 36.000 7.350 8.300 0 10 0.515 0.965 67966062 COF 0 7.825
17 2012-01-03 2012-01-21 C 37.000 6.800 7.050 0 4 0.550 0.929 70433773 COF 0 6.925
18 2012-01-03 2012-01-21 C 38.000 5.850 6.000 0 589 0.481 0.921 54279804 COF 0 5.925
raw.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4467804 entries, 14 to 5719675
Data columns (total 14 columns):
date             datetime64[ns]
exdate           datetime64[ns]
cp_flag          object
strike_price     float64
best_bid         float64
best_offer       float64
volume           int64
open_interest    int64
imvol            float64
delta            float64
optionid         int64
ticker           object
index_flag       int64
midquo           float64
dtypes: datetime64[ns](2), float64(6), int64(4), object(2)
memory usage: 511.3+ MB
# change the implied volatility name (it's too long)
raw.rename(columns={'impl_volatility': 'imvol'}, inplace=True)
# OptionMetrics reports strike_price as the strike multiplied by 1,000, so rescale it;
# then compute the mid quote from the best bid and offer
raw['strike_price'] = raw['strike_price'] / 1000.0
raw['midquo'] = (raw['best_bid'] + raw['best_offer']) / 2
raw.head(5)
date exdate cp_flag strike_price best_bid best_offer volume open_interest imvol delta optionid ticker index_flag midquo
14 2012-01-03 2012-01-21 C 0.034 9.300 10.250 0 0 0.529 0.988 70433772 COF 0 9.775
15 2012-01-03 2012-01-21 C 0.035 8.600 9.200 0 1237 0.666 0.948 45658102 COF 0 8.900
16 2012-01-03 2012-01-21 C 0.036 7.350 8.300 0 10 0.515 0.965 67966062 COF 0 7.825
17 2012-01-03 2012-01-21 C 0.037 6.800 7.050 0 4 0.550 0.929 70433773 COF 0 6.925
18 2012-01-03 2012-01-21 C 0.038 5.850 6.000 0 589 0.481 0.921 54279804 COF 0 5.925
# drop contracts with NaN implied volatility or NaN delta
raw.dropna(subset=['imvol', 'delta'], how='any', inplace=True)
# Exclude implied volatility above 150%
raw = raw[raw['imvol'] < 1.5]
# Exclude contracts whose best offer is at or below $0.05
raw = raw[raw['best_offer'] > 0.05]
raw.head(5)
date exdate cp_flag strike_price best_bid best_offer volume open_interest imvol delta optionid ticker index_flag midquo
14 2012-01-03 2012-01-21 C 0.034 9.300 10.250 0 0 0.529 0.988 70433772 COF 0 9.775
15 2012-01-03 2012-01-21 C 0.035 8.600 9.200 0 1237 0.666 0.948 45658102 COF 0 8.900
16 2012-01-03 2012-01-21 C 0.036 7.350 8.300 0 10 0.515 0.965 67966062 COF 0 7.825
17 2012-01-03 2012-01-21 C 0.037 6.800 7.050 0 4 0.550 0.929 70433773 COF 0 6.925
18 2012-01-03 2012-01-21 C 0.038 5.850 6.000 0 589 0.481 0.921 54279804 COF 0 5.925
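As an aside, the two filters above can be expressed in a single pass with DataFrame.query; a minimal equivalent sketch:

# same two filters in one statement (equivalent to the boolean masks above)
raw = raw.query('imvol < 1.5 and best_offer > 0.05')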
# calculate maturity in calendar days
raw['maturity'] = (raw['exdate'] - raw['date']).dt.days
# Exclude contracts with maturity longer than 360 days or shorter than 7 days
raw = raw[(raw['maturity'] <= 360) & (raw['maturity'] >= 7)]
raw.head(5)
date exdate cp_flag strike_price best_bid best_offer volume open_interest imvol delta optionid ticker index_flag midquo maturity
14 2012-01-03 2012-01-21 C 0.034 9.300 10.250 0 0 0.529 0.988 70433772 COF 0 9.775 18
15 2012-01-03 2012-01-21 C 0.035 8.600 9.200 0 1237 0.666 0.948 45658102 COF 0 8.900 18
16 2012-01-03 2012-01-21 C 0.036 7.350 8.300 0 10 0.515 0.965 67966062 COF 0 7.825 18
17 2012-01-03 2012-01-21 C 0.037 6.800 7.050 0 4 0.550 0.929 70433773 COF 0 6.925 18
18 2012-01-03 2012-01-21 C 0.038 5.850 6.000 0 589 0.481 0.921 54279804 COF 0 5.925 18
# save the cleaned data: the processing above is time-consuming, so next time we can simply load it
hdf = pd.HDFStore('hdf.h5')
hdf['raw'] = raw
hdf.close()
# to resume from here next time:
# hdf = pd.HDFStore('hdf.h5')
# raw = hdf['raw']
# hdf.close()
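As a small aside, pd.HDFStore also works as a context manager, which guarantees the file is closed even if a write fails:

# equivalent save and load with context managers (the file closes automatically)
with pd.HDFStore('hdf.h5') as hdf:
    hdf['raw'] = raw
with pd.HDFStore('hdf.h5') as hdf:
    raw = hdf['raw']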

2. Some summary statistics

# select the variables we need
clean = raw[['date', 'ticker', 'imvol', 'delta', 'maturity', 'cp_flag']].reset_index(drop=True)
clean.head(5)
date ticker imvol delta maturity cp_flag
0 2012-01-03 COF 0.529 0.988 18 C
1 2012-01-03 COF 0.666 0.948 18 C
2 2012-01-03 COF 0.515 0.965 18 C
3 2012-01-03 COF 0.550 0.929 18 C
4 2012-01-03 COF 0.481 0.921 18 C
print('stock tickers:\n',clean['ticker'].unique())
print('maturity range:\n',clean['maturity'].min(),clean['maturity'].max())
print('delta range:\n',clean['delta'].min(),clean['delta'].max())
stock tickers:
 ['COF' 'DD' 'XOM' 'INTC' 'SPX' 'UNH']
maturity range:
 7 360
delta range:
 -0.999934 0.999999
stats = clean.groupby(['ticker','cp_flag'])[['imvol','delta']].describe()
stats
imvol delta
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
ticker cp_flag
COF C 100419.000 0.295 0.146 0.063 0.199 0.249 0.341 1.000 100419.000 0.523 0.359 0.008 0.130 0.577 0.884 1.000
P 108812.000 0.319 0.144 0.057 0.213 0.280 0.386 1.000 108812.000 -0.372 0.340 -1.000 -0.710 -0.240 -0.059 -0.003
DD C 125769.000 0.264 0.138 0.063 0.178 0.217 0.299 0.999 125769.000 0.553 0.366 0.008 0.146 0.639 0.917 1.000
P 134941.000 0.262 0.105 0.047 0.187 0.233 0.313 0.999 134941.000 -0.392 0.352 -1.000 -0.760 -0.255 -0.063 -0.004
INTC C 113863.000 0.306 0.150 0.084 0.218 0.248 0.331 1.000 113863.000 0.577 0.349 0.015 0.209 0.660 0.922 1.000
P 127602.000 0.293 0.113 0.074 0.221 0.260 0.334 1.000 127602.000 -0.505 0.359 -1.000 -0.882 -0.514 -0.122 -0.007
SPX C 1240711.000 0.245 0.151 0.050 0.139 0.202 0.297 1.000 1240711.000 0.674 0.361 0.001 0.373 0.869 0.970 1.000
P 1344534.000 0.259 0.135 0.033 0.157 0.233 0.326 1.000 1344534.000 -0.251 0.328 -1.000 -0.419 -0.066 -0.010 -0.000
UNH C 86588.000 0.280 0.133 0.076 0.204 0.236 0.303 1.000 86588.000 0.536 0.364 0.007 0.141 0.600 0.904 1.000
P 92393.000 0.290 0.118 0.066 0.213 0.255 0.333 0.999 92393.000 -0.395 0.352 -1.000 -0.759 -0.271 -0.058 -0.003
XOM C 112332.000 0.240 0.140 0.034 0.156 0.187 0.265 1.000 112332.000 0.535 0.358 0.006 0.148 0.601 0.889 1.000
P 126483.000 0.244 0.110 0.039 0.169 0.213 0.287 0.999 126483.000 -0.418 0.353 -1.000 -0.796 -0.316 -0.073 -0.003
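If the full describe output is too wide for reporting, the top level of the column MultiIndex can be sliced directly; a minimal sketch keeping a few implied-volatility statistics:

# compact subset of the implied-volatility summary statistics
iv_stats = stats['imvol'][['mean','std','min','max']].round(3)
print(iv_stats)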

3. Transform regression variables

# constant
clean['const'] = 1
# scaled moneyness: put deltas lie in [-1, 0], so adding 1 maps them onto the
# same [0, 1] scale as call deltas; subtracting 0.5 then centers both at zero
clean['delta_n'] = clean['delta']
clean.loc[(clean['cp_flag']=='P'),'delta_n'] = 1 + clean.loc[(clean['cp_flag']=='P'),'delta']
clean['delta_n'] = clean['delta_n'] - 0.5
# scaled maturity: divide by the 360-day cap and center at zero
clean['matur_n'] = clean['maturity']/360 - 0.5
# quadratic term of moneyness
clean['delta2'] = clean['delta_n']**2
# interaction between maturity and moneyness
clean['inter'] = clean['delta_n']*clean['matur_n']
clean.head(5)
date ticker imvol delta maturity cp_flag const delta_n matur_n delta2 inter
0 2012-01-03 COF 0.529 0.988 18 C 1 0.488 -0.450 0.239 -0.220
1 2012-01-03 COF 0.666 0.948 18 C 1 0.448 -0.450 0.201 -0.202
2 2012-01-03 COF 0.515 0.965 18 C 1 0.465 -0.450 0.216 -0.209
3 2012-01-03 COF 0.550 0.929 18 C 1 0.429 -0.450 0.184 -0.193
4 2012-01-03 COF 0.481 0.921 18 C 1 0.421 -0.450 0.177 -0.189
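Together these terms define the quadratic implied-volatility surface that is fitted ticker by ticker and day by day in the next section (writing $\Delta_n$ for delta_n and $\tau_n$ for matur_n):

$$\text{imvol}_i = \beta_0 + \beta_1 \Delta_{n,i} + \beta_2 \tau_{n,i} + \beta_3 \Delta_{n,i}^2 + \beta_4 \Delta_{n,i}\,\tau_{n,i} + \varepsilon_i$$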

4. Cross-section regression

import statsmodels.api as sm

4.1 Straightforward way

tickers = clean['ticker'].unique()
ticker = tickers[0]

tmp = clean[clean['ticker']==ticker]
dates = tmp['date'].unique()
tmp = tmp[tmp['date']==dates[0]]
tmp.head(5)
date ticker imvol delta maturity cp_flag const delta_n matur_n delta2 inter
0 2012-01-03 COF 0.529 0.988 18 C 1 0.488 -0.450 0.239 -0.220
1 2012-01-03 COF 0.666 0.948 18 C 1 0.448 -0.450 0.201 -0.202
2 2012-01-03 COF 0.515 0.965 18 C 1 0.465 -0.450 0.216 -0.209
3 2012-01-03 COF 0.550 0.929 18 C 1 0.429 -0.450 0.184 -0.193
4 2012-01-03 COF 0.481 0.921 18 C 1 0.421 -0.450 0.177 -0.189
exogs = ['const','delta_n','matur_n','delta2','inter']
mB = np.linalg.lstsq(tmp[exogs], tmp['imvol'], rcond=None)[0]
mB
array([     0.272,      0.176,     -0.170,      0.920,      0.054])
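Since statsmodels is already imported, the coefficients are easy to cross-check (a sanity check, not part of the pipeline):

# the OLS point estimates should match the lstsq solution above
print(sm.OLS(tmp['imvol'], tmp[exogs]).fit().params)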

4.2 More concise way

def CrossReg(df,exogs):
    mod = sm.OLS(df['imvol'],df[exogs]).fit()
    ind = pd.MultiIndex.from_product([['params','tvalues'],exogs])
    ret = pd.Series(index=ind, dtype=float)
    ret.loc['params'] = mod.params.values
    ret.loc['tvalues'] = mod.tvalues.values
    ret.loc[('r2','')] = mod.rsquared  # use a full tuple key so the MultiIndex stays consistent
    return ret

exogs = ['const','delta_n','matur_n','delta2','inter']
mB = clean.groupby(['ticker','date']).apply(lambda x:CrossReg(x,exogs))
mB.loc['SPX']['params'].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f2d109103c8>

[figure: daily SPX regression coefficients, one line per parameter]
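The fit statistics stored by CrossReg can be pulled out the same way; for instance, to compare average fit quality across tickers (a quick inspection, assuming the ('r2','') key used above):

# average cross-sectional R-squared per ticker
print(mB[('r2','')].groupby(level='ticker').mean())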

4.3 Need for speed

4.3.1 Just for coefficients, sequential

def CrossRegSpeed(df,exogs):
    # plain least squares: skip the statsmodels overhead when we only need betas
    mB = np.linalg.lstsq(df[exogs], df['imvol'], rcond=None)[0]
    return pd.Series(mB,index=exogs)
mB = clean.groupby(['ticker','date']).apply(lambda x:CrossRegSpeed(x,exogs))

4.3.2 Just for coefficients, parallel

import multiprocessing as mp
from functools import partial

def CrossRegParHelper(ticker):
    # DATA and EXOGS are module-level globals shared with the workers via
    # fork, so this pattern needs a 'fork' start method (the default on Linux)
    tmp = DATA[DATA['ticker']==ticker]
    mB = tmp.groupby(['date']).apply(lambda x:CrossRegSpeed(x,EXOGS))
    mB['ticker'] = ticker
    return mB

def CrossRegPar(data,exogs):
    global DATA, EXOGS
    DATA = data
    EXOGS = exogs
    tickers = data['ticker'].unique()  # derive tickers here instead of relying on the notebook-level variable
    pool = mp.Pool(processes=12)
    results = pool.map(CrossRegParHelper, tickers)
    pool.close()
    pool.join()
    mB = pd.concat(results)
    return mB

mB = CrossRegPar(clean,exogs)
%timeit clean.groupby(['ticker','date']).apply(lambda x:CrossRegSpeed(x,exogs))
8 s ± 33.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit CrossRegPar(clean,exogs)
4.2 s ± 51.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
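The unused partial import hints at an alternative that avoids module-level globals: bind the data as explicit arguments. A minimal sketch (a hypothetical variant, not the version timed above); note that each task must then pickle its arguments, which is exactly the overhead the global-plus-fork trick avoids:

def CrossRegParHelperExplicit(ticker, data, exogs):
    # same work as CrossRegParHelper, but with explicit arguments
    tmp = data[data['ticker']==ticker]
    mB = tmp.groupby(['date']).apply(lambda x:CrossRegSpeed(x,exogs))
    mB['ticker'] = ticker
    return mB

# results = pool.map(partial(CrossRegParHelperExplicit, data=clean, exogs=exogs), tickers)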

4.3.3 JIT compilation

import numba as nb
clean.sort_values(by=['ticker','date'],inplace=True)
locs = clean.groupby(['ticker','date']).size().values.cumsum()
locs = np.insert(locs,0,0)
mY = clean[['imvol']].values
mX = clean[exogs].values
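The sort above makes each (ticker, date) group a contiguous block of rows, and locs turns the group sizes into row boundaries, so group i occupies rows locs[i]:locs[i+1]. A toy illustration:

# group sizes [2, 3] become boundaries [0, 2, 5]
print(np.insert(np.array([2, 3]).cumsum(), 0, 0))  # [0 2 5]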
@nb.jit
def CrossRegJIT(mX,mY,locs):
    N = locs.shape[0]
    mB = np.zeros((mX.shape[1],N-1))
    for i in range(0,N-1):
        # solve the least-squares problem for the i-th (ticker, date) block
        b = np.linalg.lstsq(mX[locs[i]:locs[i+1]],mY[locs[i]:locs[i+1]])[0]
        mB[:,i] = b[:,0]
    return mB

mB = CrossRegJIT(mX,mY,locs)
%timeit CrossRegJIT(mX,mY,locs)
702 ms ± 6.65 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
mB = pd.DataFrame(mB.T,index=clean.groupby(['ticker','date']).size().index,columns=exogs)
mB.head(5)
const delta_n matur_n delta2 inter
ticker date
COF 2012-01-03 0.272 0.176 -0.170 0.920 0.054
COF 2012-01-04 0.288 0.079 -0.090 1.023 -0.263
COF 2012-01-05 0.288 0.160 -0.129 0.776 -0.110
COF 2012-01-06 0.251 0.246 -0.149 1.189 0.076
COF 2012-01-09 0.267 0.176 -0.167 0.881 0.312

5. Principal Component Analysis

mB.reset_index(inplace=True)
level = pd.pivot_table(mB,index='date',columns='ticker',values='const')
level.plot()

[figure: time series of the estimated level coefficient (const) for each ticker]
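One caveat before the PCA: the pivot leaves a NaN on any date where a ticker has no coefficient estimate, and the eigen-decomposition below assumes a complete matrix. A minimal guard (a no-op if the panel is already balanced):

# drop dates with missing coefficients so the covariance matrix is well-defined
level = level.dropna()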

from sklearn import preprocessing
import scipy as sp
import scipy.linalg  # 'import scipy' alone does not expose sp.linalg
def pcaAnalysis(mX, iPC, useCov=1):
    if useCov:
        # mCov = np.cov(mX, rowvar = 0)
        de_meanX = mX - np.tile(np.mean(mX, axis=0), [mX.shape[0], 1])
        mCov = np.dot(de_meanX.transpose(), de_meanX) / mX.shape[0]

        eigval, eigvec = sp.linalg.eigh(mCov)
        eigidx = np.argsort(eigval)
        maxidx = eigidx[::-1]
        sorted_vec = eigvec[:, maxidx]

        mPC = np.dot(de_meanX, sorted_vec)
        return mPC[:, 0:iPC], (eigval[maxidx] / np.sum(eigval))[0:iPC]

    else:
        mCoef = np.corrcoef(mX, rowvar=0)
        scaled_X = preprocessing.scale(mX)
        eigval, eigvec = sp.linalg.eigh(mCoef)

        eigidx = np.argsort(eigval)
        maxidx = eigidx[::-1]
        sorted_vec = eigvec[:, maxidx]

        mPC = np.dot(scaled_X, sorted_vec)
        return preprocessing.scale(mPC[:, 0:iPC]), (eigval[maxidx] /
                                                    np.sum(eigval))[0:iPC]
mPC,pct = pcaAnalysis(level.values[:500],3)
pct
array([  7.22e-01,   9.53e-02,   6.82e-02])
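As a cross-check, scikit-learn's PCA (sklearn is already a dependency via preprocessing) should report nearly identical variance ratios; it normalizes by N-1 rather than N, and component signs may flip:

from sklearn.decomposition import PCA
pca = PCA(n_components=3)
mPC_sk = pca.fit_transform(level.values[:500])
print(pca.explained_variance_ratio_)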
plt.plot(mPC)
[<matplotlib.lines.Line2D at 0x7f054028cb70>,
 <matplotlib.lines.Line2D at 0x7f054028c518>,
 <matplotlib.lines.Line2D at 0x7f054028cf28>]

[figure: the first three principal components of the level series]