Source code for spreg.diagnostics

"""
Diagnostics for regression estimations. 
        
"""
__author__ = "Luc Anselin luc.anselin@asu.edu, Nicholas Malizia nicholas.malizia@asu.edu "

from libpysal.common import *
import scipy.sparse as SP
from math import sqrt, pi
from .utils import spmultiply, sphstack, spmin, spmax


__all__ = [
    "f_stat", "t_stat", "r2", "ar2", "se_betas", "log_likelihood", "akaike", "schwarz",
    "condition_index", "jarque_bera", "breusch_pagan", "white", "koenker_bassett", "vif", "likratiotest",
    "constant_check"]


[docs]def f_stat(reg):
    """
    Calculates the f-statistic and associated p-value of the
    regression. :cite:`Greene2003`.
    (For two stage least squares see f_stat_tsls)

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    ----------
    fs_result       : tuple
                      includes value of F statistic and associated p-value

    Examples
    --------
    >>> import numpy as np
    >>> import libpysal
    >>> from libpysal import examples
    >>> import spreg
    >>> from spreg import OLS

    Read the DBF associated with the Columbus data. 

    >>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.    

    >>> reg = OLS(y,X)

    Calculate the F-statistic for the regression. 

    >>> testresult = spreg.f_stat(reg)

    Print the results tuple, including the statistic and its significance.

    >>> print("%12.12f"%testresult[0],"%12.12f"%testresult[1])
    28.385629224695 0.000000009341

    """
    k = reg.k            # (scalar) number of ind. vars (includes constant)
    n = reg.n            # (scalar) number of observations
    utu = reg.utu        # (scalar) residual sum of squares
    predy = reg.predy    # (array) vector of predicted values (n x 1)
    mean_y = reg.mean_y  # (scalar) mean of dependent observations
    Q = utu
    U = np.sum((predy - mean_y) ** 2)
    fStat = (U / (k - 1)) / (Q / (n - k))
    pValue = stats.f.sf(fStat, k - 1, n - k)
    fs_result = (fStat, pValue)
    return fs_result


def t_stat(reg, z_stat=False):
    """
    Calculates the t-statistics (or z-statistics) and associated
    p-values. :cite:`Greene2003`

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model
    z_stat          : boolean
                      If True run z-stat instead of t-stat

    Returns
    -------
    ts_result       : list of tuples
                      each tuple includes value of t statistic (or z
                      statistic) and associated p-value

    Examples
    --------
    >>> import numpy as np
    >>> import libpysal
    >>> from libpysal import examples
    >>> import spreg
    >>> from spreg import OLS

    Read the DBF associated with the Columbus data. 

    >>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.    

    >>> reg = OLS(y,X)

    Calculate t-statistics for the regression coefficients. 

    >>> testresult = spreg.t_stat(reg)

    Print the tuples that contain the t-statistics and their significances.

    >>> print("%12.12f"%testresult[0][0], "%12.12f"%testresult[0][1], "%12.12f"%testresult[1][0], "%12.12f"%testresult[1][1], "%12.12f"%testresult[2][0], "%12.12f"%testresult[2][1])
    14.490373143689 0.000000000000 -4.780496191297 0.000018289595 -2.654408642718 0.010874504910
    """

    k = reg.k           # (scalar) number of ind. vars (includes constant)
    n = reg.n           # (scalar) number of observations
    vm = reg.vm         # (array) coefficients of variance matrix (k x k)
    betas = reg.betas   # (array) coefficients of the regressors (1 x k)
    variance = vm.diagonal()
    tStat = betas[list(range(0, len(vm)))].reshape(len(vm),) / np.sqrt(variance)
    ts_result = []
    for t in tStat:
        if z_stat:
            ts_result.append((t, stats.norm.sf(abs(t)) * 2))
        else:
            ts_result.append((t, stats.t.sf(abs(t), n - k) * 2))
    return ts_result


[docs]def r2(reg):
    """
    Calculates the R^2 value for the regression. :cite:`Greene2003` 

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    ----------
    r2_result       : float
                      value of the coefficient of determination for the
                      regression 

    Examples
    --------
    >>> import numpy as np
    >>> import libpysal
    >>> from libpysal import examples
    >>> import spreg
    >>> from spreg import OLS

    Read the DBF associated with the Columbus data.

    >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector.

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression. 

    >>> reg = OLS(y,X)

    Calculate the R^2 value for the regression. 

    >>> testresult = spreg.r2(reg)

    Print the result. 

    >>> print("%1.8f"%testresult)
    0.55240404

    """
    y = reg.y               # (array) vector of dep observations (n x 1)
    mean_y = reg.mean_y     # (scalar) mean of dep observations
    utu = reg.utu           # (scalar) residual sum of squares
    ss_tot = ((y - mean_y) ** 2).sum(0)
    r2 = 1 - utu / ss_tot
    r2_result = r2[0]
    return r2_result


[docs]def ar2(reg):
    """
    Calculates the adjusted R^2 value for the regression. :cite:`Greene2003`

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model   

    Returns
    ----------
    ar2_result      : float
                      value of R^2 adjusted for the number of explanatory
                      variables.

    Examples
    --------
    >>> import numpy as np
    >>> import libpysal
    >>> from libpysal import examples
    >>> import spreg
    >>> from spreg import OLS

    Read the DBF associated with the Columbus data.

    >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression. 

    >>> reg = OLS(y,X)

    Calculate the adjusted R^2 value for the regression. 
    >>> testresult = spreg.ar2(reg)

    Print the result. 

    >>> print("%1.8f"%testresult)
    0.53294335

    """
    k = reg.k       # (scalar) number of ind. variables (includes constant)
    n = reg.n       # (scalar) number of observations
    ar2_result = 1 - (1 - r2(reg)) * (n - 1) / (n - k)
    return ar2_result


[docs]def se_betas(reg):
    """
    Calculates the standard error of the regression coefficients. :cite:`Greene2003`

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    ----------
    se_result       : array
                      includes standard errors of each coefficient (1 x k)

    Examples
    --------
    >>> import numpy as np
    >>> import libpysal
    >>> from libpysal import examples
    >>> import spreg
    >>> from spreg import OLS

    Read the DBF associated with the Columbus data. 

    >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector.

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression. 

    >>> reg = OLS(y,X)

    Calculate the standard errors of the regression coefficients. 

    >>> testresult = spreg.se_betas(reg)

    Print the vector of standard errors. 

    >>> testresult
    array([4.73548613, 0.33413076, 0.10319868])

    """
    vm = reg.vm         # (array) coefficients of variance matrix (k x k)
    variance = vm.diagonal()
    se_result = np.sqrt(variance)
    return se_result


[docs]def log_likelihood(reg):
    """
    Calculates the log-likelihood value for the regression. :cite:`Greene2003` 

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    -------
    ll_result       : float
                      value for the log-likelihood of the regression.

    Examples
    --------
    >>> import numpy as np
    >>> import libpysal
    >>> from libpysal import examples
    >>> import spreg
    >>> from spreg import OLS

    Read the DBF associated with the Columbus data. 

    >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the log-likelihood for the regression. 

    >>> testresult = spreg.log_likelihood(reg)

    Print the result. 

    >>> testresult
    -187.3772388121491

    """
    n = reg.n       # (scalar) number of observations
    utu = reg.utu   # (scalar) residual sum of squares
    ll_result = -0.5 * \
        (n * (np.log(2 * pi)) + n * np.log(utu / n) + (utu / (utu / n)))
    return ll_result


[docs]def akaike(reg):
    """
    Calculates the Akaike Information Criterion. :cite:`Akaike1974`

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    -------
    aic_result      : scalar
                      value for Akaike Information Criterion of the
                      regression. 

    Examples
    --------
    >>> import numpy as np
    >>> import libpysal
    >>> from libpysal import examples
    >>> import spreg
    >>> from spreg import OLS

    Read the DBF associated with the Columbus data.

    >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the Akaike Information Criterion (AIC). 

    >>> testresult = spreg.akaike(reg)

    Print the result. 

    >>> testresult
    380.7544776242982

    """
    k = reg.k       # (scalar) number of explanatory vars (including constant)
    try:   # ML estimation, logll already exists
        # spatial coefficient included in k
        aic_result = 2.0 * k - 2.0 * reg.logll
    except AttributeError:           # OLS case
        n = reg.n       # (scalar) number of observations
        utu = reg.utu   # (scalar) residual sum of squares
        aic_result = 2 * k + n * (np.log((2 * np.pi * utu) / n) + 1)
    return aic_result


[docs]def schwarz(reg):
    """
    Calculates the Schwarz Information Criterion. :cite:`Schwarz1978`

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    -------
    bic_result      : scalar
                      value for Schwarz (Bayesian) Information Criterion of
                      the regression. 

    Examples
    --------
    >>> import numpy as np
    >>> import libpysal
    >>> from libpysal import examples
    >>> import spreg
    >>> from spreg import OLS

    Read the DBF associated with the Columbus data.

    >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the Schwarz Information Criterion. 

    >>> testresult = spreg.schwarz(reg)

    Print the results. 

    >>> np.round(testresult, 5)
    386.42994

    """
    n = reg.n      # (scalar) number of observations
    k = reg.k      # (scalar) number of ind. variables (including constant)
    try:  # ML case logll already computed
        # spatial coeff included in k
        sc_result = k * np.log(n) - 2.0 * reg.logll
    except AttributeError:          # OLS case
        utu = reg.utu  # (scalar) residual sum of squares
        sc_result = k * np.log(n) + n * (np.log((2 * np.pi * utu) / n) + 1)
    return sc_result


[docs]def condition_index(reg):
    """
    Calculates the multicollinearity condition index according to Belsey,
    Kuh and Welsh (1980) :cite:`Belsley1980`.

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    -------
    ci_result       : float
                      scalar value for the multicollinearity condition
                      index. 

    Examples
    --------
    >>> import numpy as np
    >>> import libpysal
    >>> from libpysal import examples
    >>> import spreg
    >>> from spreg import OLS

    Read the DBF associated with the Columbus data.

    >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the condition index to check for multicollinearity.

    >>> testresult = spreg.condition_index(reg)

    Print the result.

    >>> print("%1.3f"%testresult)
    6.542

    """
    if hasattr(reg, 'xtx'):
        xtx = reg.xtx   # (array) k x k projection matrix (includes constant)
    elif hasattr(reg, 'hth'):
        xtx = reg.hth   # (array) k x k projection matrix (includes constant)
    diag = np.diagonal(xtx)
    scale = xtx / diag
    eigval = np.linalg.eigvals(scale)
    max_eigval = max(eigval)
    min_eigval = min(eigval)
    ci_result = sqrt(max_eigval / min_eigval)
    return ci_result


[docs]def jarque_bera(reg):
    """
    Jarque-Bera test for normality in the residuals. :cite:`Jarque1980`

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    ------- 
    jb_result       : dictionary
                      contains the statistic (jb) for the Jarque-Bera test
                      and the associated p-value (p-value)
    df              : integer
                      degrees of freedom for the test (always 2)
    jb              : float
                      value of the test statistic
    pvalue          : float
                      p-value associated with the statistic (chi^2
                      distributed with 2 df)

    Examples
    --------
    >>> import numpy as np
    >>> import libpysal
    >>> from libpysal import examples
    >>> import spreg
    >>> from spreg import OLS

    Read the DBF associated with the Columbus data.

    >>> db = libpysal.io.open(examples.get_path("columbus.dbf"), "r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the Jarque-Bera test for normality of residuals.

    >>> testresult = spreg.jarque_bera(reg)

    Print the degrees of freedom for the test.

    >>> testresult['df']
    2

    Print the test statistic. 

    >>> print("%1.3f"%testresult['jb'])
    1.836

    Print the associated p-value.

    >>> print("%1.4f"%testresult['pvalue'])
    0.3994

    """
    n = reg.n               # (scalar) number of observations
    u = reg.u               # (array) residuals from regression
    u2 = u ** 2
    u3 = u ** 3
    u4 = u ** 4
    mu2 = np.mean(u2)
    mu3 = np.mean(u3)
    mu4 = np.mean(u4)
    S = mu3 / (mu2 ** (1.5))    # skewness measure
    K = (mu4 / (mu2 ** 2))      # kurtosis measure
    jb = n * (((S ** 2) / 6) + ((K - 3) ** 2) / 24)
    pvalue = stats.chisqprob(jb, 2)
    jb_result = {"df": 2, "jb": jb, 'pvalue': pvalue}
    return jb_result


[docs]def breusch_pagan(reg, z=None):
    """
    Calculates the Breusch-Pagan test statistic to check for
    heteroscedasticity. :cite:`Breusch1979`

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model
    z               : array
                      optional input for specifying an alternative set of
                      variables (Z) to explain the observed variance. By
                      default this is a matrix of the squared explanatory
                      variables (X**2) with a constant added to the first
                      column if not already present. In the default case,
                      the explanatory variables are squared to eliminate
                      negative values. 

    Returns
    -------
    bp_result       : dictionary
                      contains the statistic (bp) for the test and the
                      associated p-value (p-value)
    bp              : float
                      scalar value for the Breusch-Pagan test statistic
    df              : integer
                      degrees of freedom associated with the test (k)
    pvalue          : float
                      p-value associated with the statistic (chi^2
                      distributed with k df)

    Notes
    -----
    x attribute in the reg object must have a constant term included. This is
    standard for spreg.OLS so no testing done to confirm constant.

    Examples
    --------
    >>> import numpy as np
    >>> import libpysal
    >>> from libpysal import examples
    >>> import spreg
    >>> from spreg import OLS

    Read the DBF associated with the Columbus data.

    >>> db = libpysal.io.open(examples.get_path("columbus.dbf"), "r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the Breusch-Pagan test for heteroscedasticity.

    >>> testresult = spreg.breusch_pagan(reg)

    Print the degrees of freedom for the test.

    >>> testresult['df']
    2

    Print the test statistic.

    >>> print("%1.3f"%testresult['bp'])
    7.900

    Print the associated p-value. 

    >>> print("%1.4f"%testresult['pvalue'])
    0.0193

    """
    e2 = reg.u ** 2
    e = reg.u
    n = reg.n
    k = reg.k
    ete = reg.utu

    den = ete / n
    g = e2 / den - 1.0

    if z == None:
        x = reg.x
        #constant = constant_check(x)
        # if constant == False:
        #    z = np.hstack((np.ones((n,1)),x))**2
        # else:
        #    z = x**2
        z = spmultiply(x, x)
    else:
        #constant = constant_check(z)
        # if constant == False:
        #    z = np.hstack((np.ones((n,1)),z))
        pass

    n, p = z.shape

    # Check to identify any duplicate columns in Z
    omitcolumn = []
    for i in range(p):
        current = z[:, i]
        for j in range(p):
            check = z[:, j]
            if i < j:
                test = abs(current - check).sum()
                if test == 0:
                    omitcolumn.append(j)

    uniqueomit = set(omitcolumn)
    omitcolumn = list(uniqueomit)

    # Now the identified columns must be removed (done in reverse to
    # prevent renumbering)
    omitcolumn.sort()
    omitcolumn.reverse()
    for c in omitcolumn:
        z = np.delete(z, c, 1)
    n, p = z.shape

    df = p - 1

    # Now that the variables are prepared, we calculate the statistic
    zt = np.transpose(z)
    gt = np.transpose(g)
    gtz = np.dot(gt, z)
    ztg = np.dot(zt, g)
    ztz = np.dot(zt, z)
    ztzi = la.inv(ztz)

    part1 = np.dot(gtz, ztzi)
    part2 = np.dot(part1, ztg)
    bp_array = 0.5 * part2
    bp = bp_array[0, 0]

    pvalue = stats.chisqprob(bp, df)
    bp_result = {'df': df, 'bp': bp, 'pvalue': pvalue}
    return bp_result


[docs]def white(reg):
    """
    Calculates the White test to check for heteroscedasticity. :cite:`White1980`

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    -------
    white_result    : dictionary
                      contains the statistic (white), degrees of freedom
                      (df) and the associated p-value (pvalue) for the
                      White test. 
    white           : float
                      scalar value for the White test statistic.
    df              : integer
                      degrees of freedom associated with the test
    pvalue          : float
                      p-value associated with the statistic (chi^2
                      distributed with k df)

    Notes
    -----
    x attribute in the reg object must have a constant term included. This is
    standard for spreg.OLS so no testing done to confirm constant.

    Examples
    --------
    >>> import numpy as np
    >>> import libpysal
    >>> from libpysal import examples
    >>> import spreg
    >>> from spreg import OLS

    Read the DBF associated with the Columbus data.

    >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the White test for heteroscedasticity.

    >>> testresult = spreg.white(reg)

    Print the degrees of freedom for the test.

    >>> print(testresult['df'])
    5

    Print the test statistic.

    >>> print("%1.3f"%testresult['wh'])
    19.946

    Print the associated p-value. 

    >>> print("%1.4f"%testresult['pvalue'])
    0.0013

    """
    e = reg.u ** 2
    k = int(reg.k)
    n = int(reg.n)
    y = reg.y
    X = reg.x
    #constant = constant_check(X)

    # Check for constant, if none add one, see Greene 2003, pg. 222
    # if constant == False:
    #    X = np.hstack((np.ones((n,1)),X))

    # Check for multicollinearity in the X matrix
    ci = condition_index(reg)
    if ci > 30:
        white_result = "Not computed due to multicollinearity."
        return white_result

    # Compute cross-products and squares of the regression variables
    if type(X).__name__ == 'ndarray':
        A = np.zeros((n, (k * (k + 1)) // 2))
    elif type(X).__name__ == 'csc_matrix' or type(X).__name__ == 'csr_matrix':
        # this is probably inefficient
        A = SP.lil_matrix((n, (k * (k + 1)) // 2))
    else:
        raise Exception("unknown X type, %s" % type(X).__name__)
    counter = 0
    for i in range(k):
        for j in range(i, k):
            v = spmultiply(X[:, i], X[:, j], False)
            A[:, counter] = v
            counter += 1

    # Append the original variables
    A = sphstack(X, A)   # note: this also converts a LIL to CSR
    n, k = A.shape

    # Check to identify any duplicate or constant columns in A
    omitcolumn = []
    for i in range(k):
        current = A[:, i]
        # remove all constant terms (will add a constant back later)
        if spmax(current) == spmin(current):
            omitcolumn.append(i)
            pass
        # do not allow duplicates
        for j in range(k):
            check = A[:, j]
            if i < j:
                test = abs(current - check).sum()
                if test == 0:
                    omitcolumn.append(j)
    uniqueomit = set(omitcolumn)
    omitcolumn = list(uniqueomit)

    # Now the identified columns must be removed
    if type(A).__name__ == 'ndarray':
        A = np.delete(A, omitcolumn, 1)
    elif type(A).__name__ == 'csc_matrix' or type(A).__name__ == 'csr_matrix':
        # this is probably inefficient
        keepcolumn = list(range(k))
        for i in omitcolumn:
            keepcolumn.remove(i)
        A = A[:, keepcolumn]
    else:
        raise Exception("unknown A type, %s" % type(X).__name__)
    A = sphstack(np.ones((A.shape[0], 1)), A)   # add a constant back in
    n, k = A.shape

    # Conduct the auxiliary regression and calculate the statistic
    from . import ols as OLS
    aux_reg = OLS.BaseOLS(e, A)
    aux_r2 = r2(aux_reg)
    wh = aux_r2 * n
    df = k - 1
    pvalue = stats.chisqprob(wh, df)
    white_result = {'df': df, 'wh': wh, 'pvalue': pvalue}
    return white_result


[docs]def koenker_bassett(reg, z=None):
    """
    Calculates the Koenker-Bassett test statistic to check for
    heteroscedasticity. :cite:`Koenker1982,Greene2003`

    Parameters
    ----------
    reg             : regression output
                      output from an instance of a regression class
    z               : array
                      optional input for specifying an alternative set of
                      variables (Z) to explain the observed variance. By
                      default this is a matrix of the squared explanatory
                      variables (X**2) with a constant added to the first
                      column if not already present. In the default case,
                      the explanatory variables are squared to eliminate
                      negative values. 

    Returns
    -------
    kb_result       : dictionary
                      contains the statistic (kb), degrees of freedom (df)
                      and the associated p-value (pvalue) for the test. 
    kb              : float
                      scalar value for the Koenker-Bassett test statistic.
    df              : integer
                      degrees of freedom associated with the test
    pvalue          : float
                      p-value associated with the statistic (chi^2
                      distributed)

    Notes
    -----
    x attribute in the reg object must have a constant term included. This is
    standard for spreg.OLS so no testing done to confirm constant.

    Examples
    --------
    >>> import numpy as np
    >>> import libpysal
    >>> from libpysal import examples
    >>> import spreg
    >>> from spreg import OLS

    Read the DBF associated with the Columbus data.

    >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the Koenker-Bassett test for heteroscedasticity.

    >>> testresult = spreg.koenker_bassett(reg)

    Print the degrees of freedom for the test.

    >>> testresult['df']
    2

    Print the test statistic.

    >>> print("%1.3f"%testresult['kb'])
    5.694

    Print the associated p-value. 

    >>> print("%1.4f"%testresult['pvalue'])
    0.0580

    """
    # The notation here matches that of Greene (2003).
    u = reg.u ** 2
    e = reg.u
    n = reg.n
    k = reg.k
    x = reg.x
    ete = reg.utu
    #constant = constant_check(x)

    ubar = ete / n
    ubari = ubar * np.ones((n, 1))
    g = u - ubari
    v = (1.0 / n) * np.sum((u - ubar) ** 2)

    if z == None:
        x = reg.x
        #constant = constant_check(x)
        # if constant == False:
        #    z = np.hstack((np.ones((n,1)),x))**2
        # else:
        #    z = x**2
        z = spmultiply(x, x)
    else:
        #constant = constant_check(z)
        # if constant == False:
        #    z = np.hstack((np.ones((n,1)),z))
        pass

    n, p = z.shape

    # Check to identify any duplicate columns in Z
    omitcolumn = []
    for i in range(p):
        current = z[:, i]
        for j in range(p):
            check = z[:, j]
            if i < j:
                test = abs(current - check).sum()
                if test == 0:
                    omitcolumn.append(j)

    uniqueomit = set(omitcolumn)
    omitcolumn = list(uniqueomit)

    # Now the identified columns must be removed (done in reverse to
    # prevent renumbering)
    omitcolumn.sort()
    omitcolumn.reverse()
    for c in omitcolumn:
        z = np.delete(z, c, 1)
    n, p = z.shape

    df = p - 1

    # Conduct the auxiliary regression.
    zt = np.transpose(z)
    gt = np.transpose(g)
    gtz = np.dot(gt, z)
    ztg = np.dot(zt, g)
    ztz = np.dot(zt, z)
    ztzi = la.inv(ztz)

    part1 = np.dot(gtz, ztzi)
    part2 = np.dot(part1, ztg)
    kb_array = (1.0 / v) * part2
    kb = kb_array[0, 0]

    pvalue = stats.chisqprob(kb, df)
    kb_result = {'kb': kb, 'df': df, 'pvalue': pvalue}
    return kb_result


[docs]def vif(reg):
    """
    Calculates the variance inflation factor for each independent variable.
    For the ease of indexing the results, the constant is currently
    included. This should be omitted when reporting the results to the
    output text. :cite:`Greene2003`

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    -------    
    vif_result      : list of tuples
                      each tuple includes the vif and the tolerance, the
                      order of the variables corresponds to their order in
                      the reg.x matrix

    Examples
    --------
    >>> import numpy as np
    >>> import libpysal
    >>> from libpysal import examples
    >>> import spreg
    >>> from spreg import OLS

    Read the DBF associated with the Columbus data.

    >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the variance inflation factor (VIF). 
    >>> testresult = spreg.vif(reg)

    Select the tuple for the income variable. 

    >>> incvif = testresult[1]

    Print the VIF for income. 

    >>> print("%12.12f"%incvif[0])
    1.333117497189

    Print the tolerance for income. 

    >>> print("%12.12f"%incvif[1])
    0.750121427487

    Repeat for the home value variable. 

    >>> hovalvif = testresult[2]
    >>> print("%12.12f"%hovalvif[0])
    1.333117497189
    >>> print("%12.12f"%hovalvif[1])
    0.750121427487

    """
    X = reg.x
    n, k = X.shape
    vif_result = []

    for j in range(k):
        Z = X.copy()
        Z = np.delete(Z, j, 1)
        y = X[:, j]
        from . import ols as OLS
        aux = OLS.BaseOLS(y, Z)
        mean_y = aux.mean_y
        utu = aux.utu
        ss_tot = sum((y - mean_y) ** 2)
        if ss_tot == 0:
            resj = MISSINGVALUE
        else:
            r2aux = 1 - utu / ss_tot
            tolj = 1 - r2aux
            vifj = 1 / tolj
            resj = (vifj, tolj)
        vif_result.append(resj)
    return vif_result


[docs]def constant_check(array):
    """
    Checks to see numpy array includes a constant.

    Parameters
    ----------
    array           : array
                      an array of variables to be inspected 

    Returns
    -------
    constant        : boolean
                      true signifies the presence of a constant

    Example
    -------

    >>> import numpy as np
    >>> import libpysal
    >>> from libpysal import examples
    >>> import spreg
    >>> from spreg import OLS
    >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))
    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T
    >>> reg = OLS(y,X)
    >>> spreg.constant_check(reg.x)
    True

    """

    n, k = array.shape
    constant = False
    for j in range(k):
        variable = array[:, j]
        varmin = variable.min()
        varmax = variable.max()
        if varmin == varmax:
            constant = True
            break
    return constant


[docs]def likratiotest(reg0, reg1):
    """
    Likelihood ratio test statistic :cite:`Greene2003`

    Parameters
    ----------
    reg0         : regression object
                   for constrained model (H0)
    reg1         : regression object
                   for unconstrained model (H1)

    Returns
    -------

    likratio     : dictionary
                   contains the statistic (likr), the degrees of
                   freedom (df) and the p-value (pvalue)
    likr         : float
                   likelihood ratio statistic
    df           : integer
                   degrees of freedom
    p-value      : float
                   p-value

    Examples
    --------

    >>> import numpy as np
    >>> import libpysal
    >>> from libpysal import examples
    >>> import scipy.stats as stats
    >>> from spreg import ML_Lag, OLS
    >>> from spreg import likratiotest

    Use the baltim sample data set

    >>> db = libpysal.io.open(examples.get_path("baltim.dbf"),'r')
    >>> y_name = "PRICE"
    >>> y = np.array(db.by_col(y_name)).T
    >>> y.shape = (len(y),1)
    >>> x_names = ["NROOM","NBATH","PATIO","FIREPL","AC","GAR","AGE","LOTSZ","SQFT"]
    >>> x = np.array([db.by_col(var) for var in x_names]).T
    >>> ww = libpysal.io.open(examples.get_path("baltim_q.gal"))
    >>> w = ww.read()
    >>> ww.close()
    >>> w.transform = 'r'

    OLS regression

    >>> ols1 = OLS(y,x)

    ML Lag regression

    >>> mllag1 = ML_Lag(y,x,w)

    >>> lr = likratiotest(ols1,mllag1)

    >>> print("Likelihood Ratio Test: {0:.4f}       df: {1}        p-value: {2:.4f}".format(lr["likr"],lr["df"],lr["p-value"]))
    Likelihood Ratio Test: 44.5721       df: 1        p-value: 0.0000

    """

    likratio = {}

    try:
        likr = 2.0 * (reg1.logll - reg0.logll)
    except AttributeError:
        raise Exception("Missing or improper log-likelihoods in regression objects")
    if likr < 0.0:  # always enforces positive likelihood ratio
        likr = -likr
    pvalue = stats.chisqprob(likr, 1)
    likratio = {"likr": likr, "df": 1, "p-value": pvalue}
    return likratio


def _test():
    import doctest
    doctest.testmod()

if __name__ == '__main__':
    _test()