"""
Diagnostics for regression estimations.
"""
__author__ = "Luc Anselin luc.anselin@asu.edu, Nicholas Malizia nicholas.malizia@asu.edu "
from libpysal.common import *
import scipy.sparse as SP
from math import sqrt, pi
from .utils import spmultiply, sphstack, spmin, spmax
__all__ = [
"f_stat", "t_stat", "r2", "ar2", "se_betas", "log_likelihood", "akaike", "schwarz",
"condition_index", "jarque_bera", "breusch_pagan", "white", "koenker_bassett", "vif", "likratiotest",
"constant_check"]
[docs]def f_stat(reg):
"""
Calculates the f-statistic and associated p-value of the
regression. :cite:`Greene2003`.
(For two stage least squares see f_stat_tsls)
Parameters
----------
reg : regression object
output instance from a regression model
Returns
----------
fs_result : tuple
includes value of F statistic and associated p-value
Examples
--------
>>> import numpy as np
>>> import libpysal
>>> from libpysal import examples
>>> import spreg
>>> from spreg import OLS
Read the DBF associated with the Columbus data.
>>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"),"r")
Create the dependent variable vector.
>>> y = np.array(db.by_col("CRIME"))
>>> y = np.reshape(y, (49,1))
Create the matrix of independent variables.
>>> X = []
>>> X.append(db.by_col("INC"))
>>> X.append(db.by_col("HOVAL"))
>>> X = np.array(X).T
Run an OLS regression.
>>> reg = OLS(y,X)
Calculate the F-statistic for the regression.
>>> testresult = spreg.f_stat(reg)
Print the results tuple, including the statistic and its significance.
>>> print("%12.12f"%testresult[0],"%12.12f"%testresult[1])
28.385629224695 0.000000009341
"""
k = reg.k # (scalar) number of ind. vars (includes constant)
n = reg.n # (scalar) number of observations
utu = reg.utu # (scalar) residual sum of squares
predy = reg.predy # (array) vector of predicted values (n x 1)
mean_y = reg.mean_y # (scalar) mean of dependent observations
Q = utu
U = np.sum((predy - mean_y) ** 2)
fStat = (U / (k - 1)) / (Q / (n - k))
pValue = stats.f.sf(fStat, k - 1, n - k)
fs_result = (fStat, pValue)
return fs_result
def t_stat(reg, z_stat=False):
"""
Calculates the t-statistics (or z-statistics) and associated
p-values. :cite:`Greene2003`
Parameters
----------
reg : regression object
output instance from a regression model
z_stat : boolean
If True run z-stat instead of t-stat
Returns
-------
ts_result : list of tuples
each tuple includes value of t statistic (or z
statistic) and associated p-value
Examples
--------
>>> import numpy as np
>>> import libpysal
>>> from libpysal import examples
>>> import spreg
>>> from spreg import OLS
Read the DBF associated with the Columbus data.
>>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"),"r")
Create the dependent variable vector.
>>> y = np.array(db.by_col("CRIME"))
>>> y = np.reshape(y, (49,1))
Create the matrix of independent variables.
>>> X = []
>>> X.append(db.by_col("INC"))
>>> X.append(db.by_col("HOVAL"))
>>> X = np.array(X).T
Run an OLS regression.
>>> reg = OLS(y,X)
Calculate t-statistics for the regression coefficients.
>>> testresult = spreg.t_stat(reg)
Print the tuples that contain the t-statistics and their significances.
>>> print("%12.12f"%testresult[0][0], "%12.12f"%testresult[0][1], "%12.12f"%testresult[1][0], "%12.12f"%testresult[1][1], "%12.12f"%testresult[2][0], "%12.12f"%testresult[2][1])
14.490373143689 0.000000000000 -4.780496191297 0.000018289595 -2.654408642718 0.010874504910
"""
k = reg.k # (scalar) number of ind. vars (includes constant)
n = reg.n # (scalar) number of observations
vm = reg.vm # (array) coefficients of variance matrix (k x k)
betas = reg.betas # (array) coefficients of the regressors (1 x k)
variance = vm.diagonal()
tStat = betas[list(range(0, len(vm)))].reshape(len(vm),) / np.sqrt(variance)
ts_result = []
for t in tStat:
if z_stat:
ts_result.append((t, stats.norm.sf(abs(t)) * 2))
else:
ts_result.append((t, stats.t.sf(abs(t), n - k) * 2))
return ts_result
[docs]def r2(reg):
"""
Calculates the R^2 value for the regression. :cite:`Greene2003`
Parameters
----------
reg : regression object
output instance from a regression model
Returns
----------
r2_result : float
value of the coefficient of determination for the
regression
Examples
--------
>>> import numpy as np
>>> import libpysal
>>> from libpysal import examples
>>> import spreg
>>> from spreg import OLS
Read the DBF associated with the Columbus data.
>>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
Create the dependent variable vector.
>>> y = np.array(db.by_col("CRIME"))
>>> y = np.reshape(y, (49,1))
Create the matrix of independent variables.
>>> X = []
>>> X.append(db.by_col("INC"))
>>> X.append(db.by_col("HOVAL"))
>>> X = np.array(X).T
Run an OLS regression.
>>> reg = OLS(y,X)
Calculate the R^2 value for the regression.
>>> testresult = spreg.r2(reg)
Print the result.
>>> print("%1.8f"%testresult)
0.55240404
"""
y = reg.y # (array) vector of dep observations (n x 1)
mean_y = reg.mean_y # (scalar) mean of dep observations
utu = reg.utu # (scalar) residual sum of squares
ss_tot = ((y - mean_y) ** 2).sum(0)
r2 = 1 - utu / ss_tot
r2_result = r2[0]
return r2_result
[docs]def ar2(reg):
"""
Calculates the adjusted R^2 value for the regression. :cite:`Greene2003`
Parameters
----------
reg : regression object
output instance from a regression model
Returns
----------
ar2_result : float
value of R^2 adjusted for the number of explanatory
variables.
Examples
--------
>>> import numpy as np
>>> import libpysal
>>> from libpysal import examples
>>> import spreg
>>> from spreg import OLS
Read the DBF associated with the Columbus data.
>>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
Create the dependent variable vector.
>>> y = np.array(db.by_col("CRIME"))
>>> y = np.reshape(y, (49,1))
Create the matrix of independent variables.
>>> X = []
>>> X.append(db.by_col("INC"))
>>> X.append(db.by_col("HOVAL"))
>>> X = np.array(X).T
Run an OLS regression.
>>> reg = OLS(y,X)
Calculate the adjusted R^2 value for the regression.
>>> testresult = spreg.ar2(reg)
Print the result.
>>> print("%1.8f"%testresult)
0.53294335
"""
k = reg.k # (scalar) number of ind. variables (includes constant)
n = reg.n # (scalar) number of observations
ar2_result = 1 - (1 - r2(reg)) * (n - 1) / (n - k)
return ar2_result
[docs]def se_betas(reg):
"""
Calculates the standard error of the regression coefficients. :cite:`Greene2003`
Parameters
----------
reg : regression object
output instance from a regression model
Returns
----------
se_result : array
includes standard errors of each coefficient (1 x k)
Examples
--------
>>> import numpy as np
>>> import libpysal
>>> from libpysal import examples
>>> import spreg
>>> from spreg import OLS
Read the DBF associated with the Columbus data.
>>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
Create the dependent variable vector.
>>> y = np.array(db.by_col("CRIME"))
>>> y = np.reshape(y, (49,1))
Create the matrix of independent variables.
>>> X = []
>>> X.append(db.by_col("INC"))
>>> X.append(db.by_col("HOVAL"))
>>> X = np.array(X).T
Run an OLS regression.
>>> reg = OLS(y,X)
Calculate the standard errors of the regression coefficients.
>>> testresult = spreg.se_betas(reg)
Print the vector of standard errors.
>>> testresult
array([4.73548613, 0.33413076, 0.10319868])
"""
vm = reg.vm # (array) coefficients of variance matrix (k x k)
variance = vm.diagonal()
se_result = np.sqrt(variance)
return se_result
[docs]def log_likelihood(reg):
"""
Calculates the log-likelihood value for the regression. :cite:`Greene2003`
Parameters
----------
reg : regression object
output instance from a regression model
Returns
-------
ll_result : float
value for the log-likelihood of the regression.
Examples
--------
>>> import numpy as np
>>> import libpysal
>>> from libpysal import examples
>>> import spreg
>>> from spreg import OLS
Read the DBF associated with the Columbus data.
>>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
Create the dependent variable vector.
>>> y = np.array(db.by_col("CRIME"))
>>> y = np.reshape(y, (49,1))
Create the matrix of independent variables.
>>> X = []
>>> X.append(db.by_col("INC"))
>>> X.append(db.by_col("HOVAL"))
>>> X = np.array(X).T
Run an OLS regression.
>>> reg = OLS(y,X)
Calculate the log-likelihood for the regression.
>>> testresult = spreg.log_likelihood(reg)
Print the result.
>>> testresult
-187.3772388121491
"""
n = reg.n # (scalar) number of observations
utu = reg.utu # (scalar) residual sum of squares
ll_result = -0.5 * \
(n * (np.log(2 * pi)) + n * np.log(utu / n) + (utu / (utu / n)))
return ll_result
[docs]def akaike(reg):
"""
Calculates the Akaike Information Criterion. :cite:`Akaike1974`
Parameters
----------
reg : regression object
output instance from a regression model
Returns
-------
aic_result : scalar
value for Akaike Information Criterion of the
regression.
Examples
--------
>>> import numpy as np
>>> import libpysal
>>> from libpysal import examples
>>> import spreg
>>> from spreg import OLS
Read the DBF associated with the Columbus data.
>>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
Create the dependent variable vector.
>>> y = np.array(db.by_col("CRIME"))
>>> y = np.reshape(y, (49,1))
Create the matrix of independent variables.
>>> X = []
>>> X.append(db.by_col("INC"))
>>> X.append(db.by_col("HOVAL"))
>>> X = np.array(X).T
Run an OLS regression.
>>> reg = OLS(y,X)
Calculate the Akaike Information Criterion (AIC).
>>> testresult = spreg.akaike(reg)
Print the result.
>>> testresult
380.7544776242982
"""
k = reg.k # (scalar) number of explanatory vars (including constant)
try: # ML estimation, logll already exists
# spatial coefficient included in k
aic_result = 2.0 * k - 2.0 * reg.logll
except AttributeError: # OLS case
n = reg.n # (scalar) number of observations
utu = reg.utu # (scalar) residual sum of squares
aic_result = 2 * k + n * (np.log((2 * np.pi * utu) / n) + 1)
return aic_result
[docs]def schwarz(reg):
"""
Calculates the Schwarz Information Criterion. :cite:`Schwarz1978`
Parameters
----------
reg : regression object
output instance from a regression model
Returns
-------
bic_result : scalar
value for Schwarz (Bayesian) Information Criterion of
the regression.
Examples
--------
>>> import numpy as np
>>> import libpysal
>>> from libpysal import examples
>>> import spreg
>>> from spreg import OLS
Read the DBF associated with the Columbus data.
>>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
Create the dependent variable vector.
>>> y = np.array(db.by_col("CRIME"))
>>> y = np.reshape(y, (49,1))
Create the matrix of independent variables.
>>> X = []
>>> X.append(db.by_col("INC"))
>>> X.append(db.by_col("HOVAL"))
>>> X = np.array(X).T
Run an OLS regression.
>>> reg = OLS(y,X)
Calculate the Schwarz Information Criterion.
>>> testresult = spreg.schwarz(reg)
Print the results.
>>> np.round(testresult, 5)
386.42994
"""
n = reg.n # (scalar) number of observations
k = reg.k # (scalar) number of ind. variables (including constant)
try: # ML case logll already computed
# spatial coeff included in k
sc_result = k * np.log(n) - 2.0 * reg.logll
except AttributeError: # OLS case
utu = reg.utu # (scalar) residual sum of squares
sc_result = k * np.log(n) + n * (np.log((2 * np.pi * utu) / n) + 1)
return sc_result
[docs]def condition_index(reg):
"""
Calculates the multicollinearity condition index according to Belsey,
Kuh and Welsh (1980) :cite:`Belsley1980`.
Parameters
----------
reg : regression object
output instance from a regression model
Returns
-------
ci_result : float
scalar value for the multicollinearity condition
index.
Examples
--------
>>> import numpy as np
>>> import libpysal
>>> from libpysal import examples
>>> import spreg
>>> from spreg import OLS
Read the DBF associated with the Columbus data.
>>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
Create the dependent variable vector.
>>> y = np.array(db.by_col("CRIME"))
>>> y = np.reshape(y, (49,1))
Create the matrix of independent variables.
>>> X = []
>>> X.append(db.by_col("INC"))
>>> X.append(db.by_col("HOVAL"))
>>> X = np.array(X).T
Run an OLS regression.
>>> reg = OLS(y,X)
Calculate the condition index to check for multicollinearity.
>>> testresult = spreg.condition_index(reg)
Print the result.
>>> print("%1.3f"%testresult)
6.542
"""
if hasattr(reg, 'xtx'):
xtx = reg.xtx # (array) k x k projection matrix (includes constant)
elif hasattr(reg, 'hth'):
xtx = reg.hth # (array) k x k projection matrix (includes constant)
diag = np.diagonal(xtx)
scale = xtx / diag
eigval = np.linalg.eigvals(scale)
max_eigval = max(eigval)
min_eigval = min(eigval)
ci_result = sqrt(max_eigval / min_eigval)
return ci_result
[docs]def jarque_bera(reg):
"""
Jarque-Bera test for normality in the residuals. :cite:`Jarque1980`
Parameters
----------
reg : regression object
output instance from a regression model
Returns
-------
jb_result : dictionary
contains the statistic (jb) for the Jarque-Bera test
and the associated p-value (p-value)
df : integer
degrees of freedom for the test (always 2)
jb : float
value of the test statistic
pvalue : float
p-value associated with the statistic (chi^2
distributed with 2 df)
Examples
--------
>>> import numpy as np
>>> import libpysal
>>> from libpysal import examples
>>> import spreg
>>> from spreg import OLS
Read the DBF associated with the Columbus data.
>>> db = libpysal.io.open(examples.get_path("columbus.dbf"), "r")
Create the dependent variable vector.
>>> y = np.array(db.by_col("CRIME"))
>>> y = np.reshape(y, (49,1))
Create the matrix of independent variables.
>>> X = []
>>> X.append(db.by_col("INC"))
>>> X.append(db.by_col("HOVAL"))
>>> X = np.array(X).T
Run an OLS regression.
>>> reg = OLS(y,X)
Calculate the Jarque-Bera test for normality of residuals.
>>> testresult = spreg.jarque_bera(reg)
Print the degrees of freedom for the test.
>>> testresult['df']
2
Print the test statistic.
>>> print("%1.3f"%testresult['jb'])
1.836
Print the associated p-value.
>>> print("%1.4f"%testresult['pvalue'])
0.3994
"""
n = reg.n # (scalar) number of observations
u = reg.u # (array) residuals from regression
u2 = u ** 2
u3 = u ** 3
u4 = u ** 4
mu2 = np.mean(u2)
mu3 = np.mean(u3)
mu4 = np.mean(u4)
S = mu3 / (mu2 ** (1.5)) # skewness measure
K = (mu4 / (mu2 ** 2)) # kurtosis measure
jb = n * (((S ** 2) / 6) + ((K - 3) ** 2) / 24)
pvalue = stats.chisqprob(jb, 2)
jb_result = {"df": 2, "jb": jb, 'pvalue': pvalue}
return jb_result
[docs]def breusch_pagan(reg, z=None):
"""
Calculates the Breusch-Pagan test statistic to check for
heteroscedasticity. :cite:`Breusch1979`
Parameters
----------
reg : regression object
output instance from a regression model
z : array
optional input for specifying an alternative set of
variables (Z) to explain the observed variance. By
default this is a matrix of the squared explanatory
variables (X**2) with a constant added to the first
column if not already present. In the default case,
the explanatory variables are squared to eliminate
negative values.
Returns
-------
bp_result : dictionary
contains the statistic (bp) for the test and the
associated p-value (p-value)
bp : float
scalar value for the Breusch-Pagan test statistic
df : integer
degrees of freedom associated with the test (k)
pvalue : float
p-value associated with the statistic (chi^2
distributed with k df)
Notes
-----
x attribute in the reg object must have a constant term included. This is
standard for spreg.OLS so no testing done to confirm constant.
Examples
--------
>>> import numpy as np
>>> import libpysal
>>> from libpysal import examples
>>> import spreg
>>> from spreg import OLS
Read the DBF associated with the Columbus data.
>>> db = libpysal.io.open(examples.get_path("columbus.dbf"), "r")
Create the dependent variable vector.
>>> y = np.array(db.by_col("CRIME"))
>>> y = np.reshape(y, (49,1))
Create the matrix of independent variables.
>>> X = []
>>> X.append(db.by_col("INC"))
>>> X.append(db.by_col("HOVAL"))
>>> X = np.array(X).T
Run an OLS regression.
>>> reg = OLS(y,X)
Calculate the Breusch-Pagan test for heteroscedasticity.
>>> testresult = spreg.breusch_pagan(reg)
Print the degrees of freedom for the test.
>>> testresult['df']
2
Print the test statistic.
>>> print("%1.3f"%testresult['bp'])
7.900
Print the associated p-value.
>>> print("%1.4f"%testresult['pvalue'])
0.0193
"""
e2 = reg.u ** 2
e = reg.u
n = reg.n
k = reg.k
ete = reg.utu
den = ete / n
g = e2 / den - 1.0
if z == None:
x = reg.x
#constant = constant_check(x)
# if constant == False:
# z = np.hstack((np.ones((n,1)),x))**2
# else:
# z = x**2
z = spmultiply(x, x)
else:
#constant = constant_check(z)
# if constant == False:
# z = np.hstack((np.ones((n,1)),z))
pass
n, p = z.shape
# Check to identify any duplicate columns in Z
omitcolumn = []
for i in range(p):
current = z[:, i]
for j in range(p):
check = z[:, j]
if i < j:
test = abs(current - check).sum()
if test == 0:
omitcolumn.append(j)
uniqueomit = set(omitcolumn)
omitcolumn = list(uniqueomit)
# Now the identified columns must be removed (done in reverse to
# prevent renumbering)
omitcolumn.sort()
omitcolumn.reverse()
for c in omitcolumn:
z = np.delete(z, c, 1)
n, p = z.shape
df = p - 1
# Now that the variables are prepared, we calculate the statistic
zt = np.transpose(z)
gt = np.transpose(g)
gtz = np.dot(gt, z)
ztg = np.dot(zt, g)
ztz = np.dot(zt, z)
ztzi = la.inv(ztz)
part1 = np.dot(gtz, ztzi)
part2 = np.dot(part1, ztg)
bp_array = 0.5 * part2
bp = bp_array[0, 0]
pvalue = stats.chisqprob(bp, df)
bp_result = {'df': df, 'bp': bp, 'pvalue': pvalue}
return bp_result
[docs]def white(reg):
"""
Calculates the White test to check for heteroscedasticity. :cite:`White1980`
Parameters
----------
reg : regression object
output instance from a regression model
Returns
-------
white_result : dictionary
contains the statistic (white), degrees of freedom
(df) and the associated p-value (pvalue) for the
White test.
white : float
scalar value for the White test statistic.
df : integer
degrees of freedom associated with the test
pvalue : float
p-value associated with the statistic (chi^2
distributed with k df)
Notes
-----
x attribute in the reg object must have a constant term included. This is
standard for spreg.OLS so no testing done to confirm constant.
Examples
--------
>>> import numpy as np
>>> import libpysal
>>> from libpysal import examples
>>> import spreg
>>> from spreg import OLS
Read the DBF associated with the Columbus data.
>>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
Create the dependent variable vector.
>>> y = np.array(db.by_col("CRIME"))
>>> y = np.reshape(y, (49,1))
Create the matrix of independent variables.
>>> X = []
>>> X.append(db.by_col("INC"))
>>> X.append(db.by_col("HOVAL"))
>>> X = np.array(X).T
Run an OLS regression.
>>> reg = OLS(y,X)
Calculate the White test for heteroscedasticity.
>>> testresult = spreg.white(reg)
Print the degrees of freedom for the test.
>>> print(testresult['df'])
5
Print the test statistic.
>>> print("%1.3f"%testresult['wh'])
19.946
Print the associated p-value.
>>> print("%1.4f"%testresult['pvalue'])
0.0013
"""
e = reg.u ** 2
k = int(reg.k)
n = int(reg.n)
y = reg.y
X = reg.x
#constant = constant_check(X)
# Check for constant, if none add one, see Greene 2003, pg. 222
# if constant == False:
# X = np.hstack((np.ones((n,1)),X))
# Check for multicollinearity in the X matrix
ci = condition_index(reg)
if ci > 30:
white_result = "Not computed due to multicollinearity."
return white_result
# Compute cross-products and squares of the regression variables
if type(X).__name__ == 'ndarray':
A = np.zeros((n, (k * (k + 1)) // 2))
elif type(X).__name__ == 'csc_matrix' or type(X).__name__ == 'csr_matrix':
# this is probably inefficient
A = SP.lil_matrix((n, (k * (k + 1)) // 2))
else:
raise Exception("unknown X type, %s" % type(X).__name__)
counter = 0
for i in range(k):
for j in range(i, k):
v = spmultiply(X[:, i], X[:, j], False)
A[:, counter] = v
counter += 1
# Append the original variables
A = sphstack(X, A) # note: this also converts a LIL to CSR
n, k = A.shape
# Check to identify any duplicate or constant columns in A
omitcolumn = []
for i in range(k):
current = A[:, i]
# remove all constant terms (will add a constant back later)
if spmax(current) == spmin(current):
omitcolumn.append(i)
pass
# do not allow duplicates
for j in range(k):
check = A[:, j]
if i < j:
test = abs(current - check).sum()
if test == 0:
omitcolumn.append(j)
uniqueomit = set(omitcolumn)
omitcolumn = list(uniqueomit)
# Now the identified columns must be removed
if type(A).__name__ == 'ndarray':
A = np.delete(A, omitcolumn, 1)
elif type(A).__name__ == 'csc_matrix' or type(A).__name__ == 'csr_matrix':
# this is probably inefficient
keepcolumn = list(range(k))
for i in omitcolumn:
keepcolumn.remove(i)
A = A[:, keepcolumn]
else:
raise Exception("unknown A type, %s" % type(X).__name__)
A = sphstack(np.ones((A.shape[0], 1)), A) # add a constant back in
n, k = A.shape
# Conduct the auxiliary regression and calculate the statistic
from . import ols as OLS
aux_reg = OLS.BaseOLS(e, A)
aux_r2 = r2(aux_reg)
wh = aux_r2 * n
df = k - 1
pvalue = stats.chisqprob(wh, df)
white_result = {'df': df, 'wh': wh, 'pvalue': pvalue}
return white_result
[docs]def koenker_bassett(reg, z=None):
"""
Calculates the Koenker-Bassett test statistic to check for
heteroscedasticity. :cite:`Koenker1982,Greene2003`
Parameters
----------
reg : regression output
output from an instance of a regression class
z : array
optional input for specifying an alternative set of
variables (Z) to explain the observed variance. By
default this is a matrix of the squared explanatory
variables (X**2) with a constant added to the first
column if not already present. In the default case,
the explanatory variables are squared to eliminate
negative values.
Returns
-------
kb_result : dictionary
contains the statistic (kb), degrees of freedom (df)
and the associated p-value (pvalue) for the test.
kb : float
scalar value for the Koenker-Bassett test statistic.
df : integer
degrees of freedom associated with the test
pvalue : float
p-value associated with the statistic (chi^2
distributed)
Notes
-----
x attribute in the reg object must have a constant term included. This is
standard for spreg.OLS so no testing done to confirm constant.
Examples
--------
>>> import numpy as np
>>> import libpysal
>>> from libpysal import examples
>>> import spreg
>>> from spreg import OLS
Read the DBF associated with the Columbus data.
>>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
Create the dependent variable vector.
>>> y = np.array(db.by_col("CRIME"))
>>> y = np.reshape(y, (49,1))
Create the matrix of independent variables.
>>> X = []
>>> X.append(db.by_col("INC"))
>>> X.append(db.by_col("HOVAL"))
>>> X = np.array(X).T
Run an OLS regression.
>>> reg = OLS(y,X)
Calculate the Koenker-Bassett test for heteroscedasticity.
>>> testresult = spreg.koenker_bassett(reg)
Print the degrees of freedom for the test.
>>> testresult['df']
2
Print the test statistic.
>>> print("%1.3f"%testresult['kb'])
5.694
Print the associated p-value.
>>> print("%1.4f"%testresult['pvalue'])
0.0580
"""
# The notation here matches that of Greene (2003).
u = reg.u ** 2
e = reg.u
n = reg.n
k = reg.k
x = reg.x
ete = reg.utu
#constant = constant_check(x)
ubar = ete / n
ubari = ubar * np.ones((n, 1))
g = u - ubari
v = (1.0 / n) * np.sum((u - ubar) ** 2)
if z == None:
x = reg.x
#constant = constant_check(x)
# if constant == False:
# z = np.hstack((np.ones((n,1)),x))**2
# else:
# z = x**2
z = spmultiply(x, x)
else:
#constant = constant_check(z)
# if constant == False:
# z = np.hstack((np.ones((n,1)),z))
pass
n, p = z.shape
# Check to identify any duplicate columns in Z
omitcolumn = []
for i in range(p):
current = z[:, i]
for j in range(p):
check = z[:, j]
if i < j:
test = abs(current - check).sum()
if test == 0:
omitcolumn.append(j)
uniqueomit = set(omitcolumn)
omitcolumn = list(uniqueomit)
# Now the identified columns must be removed (done in reverse to
# prevent renumbering)
omitcolumn.sort()
omitcolumn.reverse()
for c in omitcolumn:
z = np.delete(z, c, 1)
n, p = z.shape
df = p - 1
# Conduct the auxiliary regression.
zt = np.transpose(z)
gt = np.transpose(g)
gtz = np.dot(gt, z)
ztg = np.dot(zt, g)
ztz = np.dot(zt, z)
ztzi = la.inv(ztz)
part1 = np.dot(gtz, ztzi)
part2 = np.dot(part1, ztg)
kb_array = (1.0 / v) * part2
kb = kb_array[0, 0]
pvalue = stats.chisqprob(kb, df)
kb_result = {'kb': kb, 'df': df, 'pvalue': pvalue}
return kb_result
[docs]def vif(reg):
"""
Calculates the variance inflation factor for each independent variable.
For the ease of indexing the results, the constant is currently
included. This should be omitted when reporting the results to the
output text. :cite:`Greene2003`
Parameters
----------
reg : regression object
output instance from a regression model
Returns
-------
vif_result : list of tuples
each tuple includes the vif and the tolerance, the
order of the variables corresponds to their order in
the reg.x matrix
Examples
--------
>>> import numpy as np
>>> import libpysal
>>> from libpysal import examples
>>> import spreg
>>> from spreg import OLS
Read the DBF associated with the Columbus data.
>>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
Create the dependent variable vector.
>>> y = np.array(db.by_col("CRIME"))
>>> y = np.reshape(y, (49,1))
Create the matrix of independent variables.
>>> X = []
>>> X.append(db.by_col("INC"))
>>> X.append(db.by_col("HOVAL"))
>>> X = np.array(X).T
Run an OLS regression.
>>> reg = OLS(y,X)
Calculate the variance inflation factor (VIF).
>>> testresult = spreg.vif(reg)
Select the tuple for the income variable.
>>> incvif = testresult[1]
Print the VIF for income.
>>> print("%12.12f"%incvif[0])
1.333117497189
Print the tolerance for income.
>>> print("%12.12f"%incvif[1])
0.750121427487
Repeat for the home value variable.
>>> hovalvif = testresult[2]
>>> print("%12.12f"%hovalvif[0])
1.333117497189
>>> print("%12.12f"%hovalvif[1])
0.750121427487
"""
X = reg.x
n, k = X.shape
vif_result = []
for j in range(k):
Z = X.copy()
Z = np.delete(Z, j, 1)
y = X[:, j]
from . import ols as OLS
aux = OLS.BaseOLS(y, Z)
mean_y = aux.mean_y
utu = aux.utu
ss_tot = sum((y - mean_y) ** 2)
if ss_tot == 0:
resj = MISSINGVALUE
else:
r2aux = 1 - utu / ss_tot
tolj = 1 - r2aux
vifj = 1 / tolj
resj = (vifj, tolj)
vif_result.append(resj)
return vif_result
[docs]def constant_check(array):
"""
Checks to see numpy array includes a constant.
Parameters
----------
array : array
an array of variables to be inspected
Returns
-------
constant : boolean
true signifies the presence of a constant
Example
-------
>>> import numpy as np
>>> import libpysal
>>> from libpysal import examples
>>> import spreg
>>> from spreg import OLS
>>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
>>> y = np.array(db.by_col("CRIME"))
>>> y = np.reshape(y, (49,1))
>>> X = []
>>> X.append(db.by_col("INC"))
>>> X.append(db.by_col("HOVAL"))
>>> X = np.array(X).T
>>> reg = OLS(y,X)
>>> spreg.constant_check(reg.x)
True
"""
n, k = array.shape
constant = False
for j in range(k):
variable = array[:, j]
varmin = variable.min()
varmax = variable.max()
if varmin == varmax:
constant = True
break
return constant
[docs]def likratiotest(reg0, reg1):
"""
Likelihood ratio test statistic :cite:`Greene2003`
Parameters
----------
reg0 : regression object
for constrained model (H0)
reg1 : regression object
for unconstrained model (H1)
Returns
-------
likratio : dictionary
contains the statistic (likr), the degrees of
freedom (df) and the p-value (pvalue)
likr : float
likelihood ratio statistic
df : integer
degrees of freedom
p-value : float
p-value
Examples
--------
>>> import numpy as np
>>> import libpysal
>>> from libpysal import examples
>>> import scipy.stats as stats
>>> from spreg import ML_Lag, OLS
>>> from spreg import likratiotest
Use the baltim sample data set
>>> db = libpysal.io.open(examples.get_path("baltim.dbf"),'r')
>>> y_name = "PRICE"
>>> y = np.array(db.by_col(y_name)).T
>>> y.shape = (len(y),1)
>>> x_names = ["NROOM","NBATH","PATIO","FIREPL","AC","GAR","AGE","LOTSZ","SQFT"]
>>> x = np.array([db.by_col(var) for var in x_names]).T
>>> ww = libpysal.io.open(examples.get_path("baltim_q.gal"))
>>> w = ww.read()
>>> ww.close()
>>> w.transform = 'r'
OLS regression
>>> ols1 = OLS(y,x)
ML Lag regression
>>> mllag1 = ML_Lag(y,x,w)
>>> lr = likratiotest(ols1,mllag1)
>>> print("Likelihood Ratio Test: {0:.4f} df: {1} p-value: {2:.4f}".format(lr["likr"],lr["df"],lr["p-value"]))
Likelihood Ratio Test: 44.5721 df: 1 p-value: 0.0000
"""
likratio = {}
try:
likr = 2.0 * (reg1.logll - reg0.logll)
except AttributeError:
raise Exception("Missing or improper log-likelihoods in regression objects")
if likr < 0.0: # always enforces positive likelihood ratio
likr = -likr
pvalue = stats.chisqprob(likr, 1)
likratio = {"likr": likr, "df": 1, "p-value": pvalue}
return likratio
def _test():
import doctest
doctest.testmod()
if __name__ == '__main__':
_test()