just a place to store things

Statsmodels: statistical modeling and econometrics in Python

Meet larry, the labeled numpy array

Simple comparison of Python and R for a basic OLS analysis

Kuiper test and other tools from circular statistics

Constrained multivariate least-squares optimizer for scipy

Flexible and powerful data analysis / manipulation library for Python, providing labeled data structures similar to R data.frame objects, statistical functions, and much more

Scipy main repository

Notes used to give tutorials (for example at Euroscipy 2010)

There are problems identifying the overidentified case from the results. A HessianInversionWarning is involved, and `converged` can be true and score_obs can have full rank in the example even if the Hessian is not invertible. In a quick check in pdb of the example, method="hessian" seems to work: it converges and the Hessian is invertible, but with 3 negative/nan bse

The constant detection in the `data` handling seems to be correct: `resf2.model.data.k_constant` is 1 for the implicit constant in `formula = "apply ~ 0 + pared + public + gpa + C(dummy)"`, and `resf2.model.data.const_idx` should mean that there is no explicit constant

Maybe it would be better to raise on the constant check of exog, rather than relying on warnings that might not get issued.

We are using numerical derivatives and have a convergence tolerance, so precision can be too low to get exact, or precise enough, non-identification from the results.

josef-pkt

comment created time in 15 hours

`+"""+Test  for ordinal models+"""++import numpy as np+import scipy.stats as stats+import pytest+import pandas as pd++from numpy.testing import assert_allclose, assert_equal+from statsmodels.tools.sm_exceptions import (+    HessianInversionWarning, SpecificationWarning)+from .results.results_ordinal_model import data_store as ds+from statsmodels.miscmodels.ordinal_model import OrderedModel++import warnings+warnings.simplefilter("ignore", SpecificationWarning)+++class CheckOrdinalModelMixin(object):++    def test_basic(self):+        # checks basic results againt R MASS package+        n_cat = ds.n_ordinal_cat+        res1 = self.res1+        res2 = self.res2+        # coefficients values, standard errors, t & p values+        assert_allclose(res1.params[:-n_cat + 1],+                        res2.coefficients_val, atol=2e-4)+        assert_allclose(res1.bse[:-n_cat + 1],+                        res2.coefficients_stdE, rtol=0.003, atol=1e-5)+        assert_allclose(res1.tvalues[:-n_cat + 1],+                        res2.coefficients_tval, rtol=0.003, atol=7e-4)+        assert_allclose(res1.pvalues[:-n_cat + 1],+                        res2.coefficients_pval, rtol=0.009, atol=1e-5)+        # thresholds are given with exponentiated increments+        # from the first threshold+        assert_allclose(+            res1.model.transform_threshold_params(res1.params)[1:-1],+            res2.thresholds, atol=4e-4)++        # probabilities+        assert_allclose(res1.predict()[:7, :],+                        res2.prob_pred, atol=5e-5)++    def test_pandas(self):+        # makes sure that the Pandas ecosystem is supported+        res1 = self.res1+        resp = self.resp+        # converges slightly differently why?+        assert_allclose(res1.params, resp.params, atol=1e-10)+        assert_allclose(res1.bse, resp.bse, atol=1e-10)++        assert_allclose(res1.model.endog, resp.model.endog, rtol=1e-10)+        assert_allclose(res1.model.exog, resp.model.exog, rtol=1e-10)++  
  def test_formula(self):+        # makes sure the "R-way" of writing models is supported+        res1 = self.res1+        resf = self.resf+        # converges slightly differently why? yet e-5 is ok+        assert_allclose(res1.params, resf.params, atol=5e-5)+        assert_allclose(res1.bse, resf.bse, atol=5e-5)++        assert_allclose(res1.model.endog, resf.model.endog, rtol=1e-10)+        assert_allclose(res1.model.exog, resf.model.exog, rtol=1e-10)++    def test_unordered(self):+        # makes sure that ordered = True is optional for the endog Serie+        # et categories have to be set in the right order+        res1 = self.res1+        resf = self.resu+        # converges slightly differently why?+        assert_allclose(res1.params, resf.params, atol=1e-10)+        assert_allclose(res1.bse, resf.bse, atol=1e-10)++        assert_allclose(res1.model.endog, resf.model.endog, rtol=1e-10)+        assert_allclose(res1.model.exog, resf.model.exog, rtol=1e-10)++    def test_results_other(self):++        res1 = self.res1  # numpy+        resp = self.resp  # pandas++        param_names_np = ['x1', 'x2', 'x3', '0/1', '1/2']+        param_names_pd = ['pared', 'public', 'gpa', 'unlikely/somewhat likely',+                          'somewhat likely/very likely']++        assert res1.model.data.param_names == param_names_np+        assert self.resp.model.data.param_names == param_names_pd+        assert self.resp.model.endog_names == "apply"++        # results+        if hasattr(self, "pred_table"):+            table = res1.pred_table()+            assert_equal(table.values, self.pred_table)++        # smoke test+        res1.summary()++        # inherited+        tt = res1.t_test(np.eye(len(res1.params)))+        assert_allclose(tt.pvalue, res1.pvalues, rtol=1e-13)++        tt = resp.t_test(['pared', 'public', 'gpa'])  # pandas names+        assert_allclose(tt.pvalue, res1.pvalues[:3], rtol=1e-13)++        pred = res1.predict(exog=res1.model.exog[-5:])+        fitted = 
res1.predict()+        assert_allclose(pred, fitted[-5:], rtol=1e-13)++        pred = resp.predict(exog=resp.model.data.orig_exog.iloc[-5:])+        fitted = resp.predict()+        assert_allclose(pred, fitted[-5:], rtol=1e-13)++        dataf = self.resf.model.data.frame  # is a dict+        dataf_df = pd.DataFrame.from_dict(dataf)+        pred = self.resf.predict(exog=dataf_df.iloc[-5:])+        fitted = self.resf.predict()+        assert_allclose(pred, fitted[-5:], rtol=1e-13)+++class TestLogitModel(CheckOrdinalModelMixin):++    @classmethod+    def setup_class(cls):+        data = ds.df+        data_unordered = ds.df_unordered++        # standard fit+        mod = OrderedModel(data['apply'].values.codes,+                           np.asarray(data[['pared', 'public', 'gpa']], float),+                           distr='logit')+        res = mod.fit(method='bfgs', disp=False)+        # standard fit with pandas input+        modp = OrderedModel(data['apply'],+                            data[['pared', 'public', 'gpa']],+                            distr='logit')+        resp = modp.fit(method='bfgs', disp=False)+        # fit with formula+        with warnings.catch_warnings():+            warnings.simplefilter("ignore", SpecificationWarning)+            modf = OrderedModel.from_formula(+                "apply ~ pared + public + gpa - 1",+                data={"apply": data['apply'].values.codes,+                      "pared": data['pared'],+                      "public": data['public'],+                      "gpa": data['gpa']},+                distr='logit')+        resf = modf.fit(method='bfgs', disp=False)+        # fit on data with ordered=False+        modu = OrderedModel(+            data_unordered['apply'].values.codes,+            np.asarray(data_unordered[['pared', 'public', 'gpa']], float),+            distr='logit')+        resu = modu.fit(method='bfgs', disp=False)++        from .results.results_ordinal_model import res_ord_logit as res2+        
cls.res2 = res2+        cls.res1 = res+        cls.resp = resp+        cls.resf = resf+        cls.resu = resu+++class TestProbitModel(CheckOrdinalModelMixin):++    @classmethod+    def setup_class(cls):+        data = ds.df+        data_unordered = ds.df_unordered++        mod = OrderedModel(data['apply'].values.codes,+                           np.asarray(data[['pared', 'public', 'gpa']], float),+                           distr='probit')+        res = mod.fit(method='bfgs', disp=False)++        modp = OrderedModel(data['apply'],+                            data[['pared', 'public', 'gpa']],+                            distr='probit')+        resp = modp.fit(method='bfgs', disp=False)++        with warnings.catch_warnings():+            warnings.simplefilter("ignore", SpecificationWarning)+            modf = OrderedModel.from_formula(+                "apply ~ pared + public + gpa - 1",+                data={"apply": data['apply'].values.codes,+                      "pared": data['pared'],+                      "public": data['public'],+                      "gpa": data['gpa']},+                distr='probit')+        resf = modf.fit(method='bfgs', disp=False)++        modu = OrderedModel(+            data_unordered['apply'].values.codes,+            np.asarray(data_unordered[['pared', 'public', 'gpa']], float),+            distr='probit')+        resu = modu.fit(method='bfgs', disp=False)++        from .results.results_ordinal_model import res_ord_probit as res2+        cls.res2 = res2+        cls.res1 = res+        cls.resp = resp+        cls.resf = resf+        cls.resu = resu++        # regression numbers+        cls.pred_table = np.array([[202,  18,   0, 220],+                                   [112,  28,   0, 140],+                                   [ 27,  13,   0,  40],  # noqa+                                   [341,  59,   0, 400]], dtype=np.int64)++    def test_loglikerelated(self):++        res1 = self.res1+        # res2 = self.res2++        mod = 
res1.model+        fact = 1.1  # evaluate away from optimum+        score1 = mod.score(res1.params * fact)+        score_obs_numdiff = mod.score_obs(res1.params * fact)+        score_obs_exog = mod.score_obs_(res1.params * fact)+        assert_allclose(score_obs_numdiff.sum(0), score1, atol=1e-7)+        assert_allclose(score_obs_exog.sum(0), score1[:mod.k_vars], atol=1e-7)++        # null model+        mod_null = OrderedModel(mod.endog, None,+                                offset=np.zeros(mod.nobs),+                                distr=mod.distr)+        null_params = mod.start_params+        res_null = mod_null.fit(method='bfgs', disp=False)+        assert_allclose(res_null.params, null_params[mod.k_vars:], rtol=1e-8)++    def test_formula_categorical(self):++        resp = self.resp+        data = ds.df++        with warnings.catch_warnings():+            warnings.simplefilter("ignore", SpecificationWarning)+            formula = "apply ~ pared + public + gpa - 1"+            modf2 = OrderedModel.from_formula(formula,+                                              data, distr='probit')+        resf2 = modf2.fit(method='bfgs', disp=False)+        assert_allclose(resf2.params, resp.params, atol=1e-8)+        assert modf2.exog_names == resp.model.exog_names+        assert modf2.data.ynames == resp.model.data.ynames+        assert hasattr(modf2.data, "frame")+        assert not hasattr(modf2, "frame")++        with pytest.raises(ValueError):+            with warnings.catch_warnings():+                warnings.simplefilter("ignore", SpecificationWarning)+                OrderedModel.from_formula(+                    "apply ~ pared + public + gpa - 1",+                    data={"apply": np.asarray(data['apply']),+                          "pared": data['pared'],+                          "public": data['public'],+                          "gpa": data['gpa']},+                    distr='probit')+++class TestLogitModelFormula():++    @classmethod+    def 
setup_class(cls):+        warnings.simplefilter("ignore", SpecificationWarning)+        data = ds.df+        nobs = len(data)+        data["dummy"] = (np.arange(nobs) < (nobs / 2)).astype(float)+        # alias to correspond to patsy name+        data["C(dummy)[T.1.0]"] = data["dummy"]+        cls.data = data++        columns = ['C(dummy)[T.1.0]', 'pared', 'public', 'gpa']+        # standard fit+        mod = OrderedModel(data['apply'].values.codes,+                           np.asarray(data[columns], float),+                           distr='logit')+        cls.res = mod.fit(method='bfgs', disp=False)+        # standard fit with pandas input+        modp = OrderedModel(data['apply'],+                            data[columns],+                            distr='logit')+        cls.resp = modp.fit(method='bfgs', disp=False)++    def test_setup(self):+        data = self.data+        resp = self.resp+        fittedvalues = resp.predict()++        formulas = ["apply ~ 1 + pared + public + gpa + C(dummy)",+                    "apply ~ pared + public + gpa + C(dummy)"]+        for formula in formulas:+            modf1 = OrderedModel.from_formula(formula, data, distr='logit')+            resf1 = modf1.fit(method='bfgs')+            summf1 = resf1.summary()+            summf1_str = str(summf1)+            assert resf1.model.exog_names == resp.model.exog_names+            assert resf1.model.data.param_names == resp.model.exog_names+            assert all(name in summf1_str for name in+                       resp.model.data.param_names)+            assert_allclose(resf1.predict(data[:5]), fittedvalues[:5])++        # test over parameterized model with implicit constant+        # warns but doesn't raise+        formula = "apply ~ 0 + pared + public + gpa + C(dummy)"++        with pytest.warns(SpecificationWarning):+            modf2 = OrderedModel.from_formula(formula, data, distr='logit')++        with pytest.warns(HessianInversionWarning):+            resf2 = 
modf2.fit(method='bfgs')+            assert np.isnan(resf2.bse).all()`
``````        with pytest.warns(HessianInversionWarning):
>           resf2 = modf2.fit(method='bfgs')
E           Failed: DID NOT WARN. No warnings of type (<class 'statsmodels.tools.sm_exceptions.HessianInversionWarning'>,) was emitted. The list of emitted warnings is: [].
``````
josef-pkt

comment created time in 15 hours

PullRequestReviewEvent

Amended and force pushed, so `assert np.isnan(resf2.bse).all()` won't be in the last commit anymore. The problem is that behavior for singular score_obs can depend on numerical details, linalg libraries, and floating point noise. That's not consistent across environments.

josef-pkt

comment created time in 20 hours

push eventjosef-pkt/statsmodels

commit sha a1fb8c2d8d5a648bde1574ce3575ee42bf2b2f2c

REF: use SpecificationWarning for formula,

push time in 20 hours

`+"""+Test  for ordinal models+"""++import numpy as np+import scipy.stats as stats+import pytest+import pandas as pd++from numpy.testing import assert_allclose, assert_equal+from statsmodels.tools.sm_exceptions import (+    HessianInversionWarning, SpecificationWarning)+from .results.results_ordinal_model import data_store as ds+from statsmodels.miscmodels.ordinal_model import OrderedModel++import warnings+warnings.simplefilter("ignore", SpecificationWarning)+++class CheckOrdinalModelMixin(object):++    def test_basic(self):+        # checks basic results againt R MASS package+        n_cat = ds.n_ordinal_cat+        res1 = self.res1+        res2 = self.res2+        # coefficients values, standard errors, t & p values+        assert_allclose(res1.params[:-n_cat + 1],+                        res2.coefficients_val, atol=2e-4)+        assert_allclose(res1.bse[:-n_cat + 1],+                        res2.coefficients_stdE, rtol=0.003, atol=1e-5)+        assert_allclose(res1.tvalues[:-n_cat + 1],+                        res2.coefficients_tval, rtol=0.003, atol=7e-4)+        assert_allclose(res1.pvalues[:-n_cat + 1],+                        res2.coefficients_pval, rtol=0.009, atol=1e-5)+        # thresholds are given with exponentiated increments+        # from the first threshold+        assert_allclose(+            res1.model.transform_threshold_params(res1.params)[1:-1],+            res2.thresholds, atol=4e-4)++        # probabilities+        assert_allclose(res1.predict()[:7, :],+                        res2.prob_pred, atol=5e-5)++    def test_pandas(self):+        # makes sure that the Pandas ecosystem is supported+        res1 = self.res1+        resp = self.resp+        # converges slightly differently why?+        assert_allclose(res1.params, resp.params, atol=1e-10)+        assert_allclose(res1.bse, resp.bse, atol=1e-10)++        assert_allclose(res1.model.endog, resp.model.endog, rtol=1e-10)+        assert_allclose(res1.model.exog, resp.model.exog, rtol=1e-10)++  
  def test_formula(self):+        # makes sure the "R-way" of writing models is supported+        res1 = self.res1+        resf = self.resf+        # converges slightly differently why? yet e-5 is ok+        assert_allclose(res1.params, resf.params, atol=5e-5)+        assert_allclose(res1.bse, resf.bse, atol=5e-5)++        assert_allclose(res1.model.endog, resf.model.endog, rtol=1e-10)+        assert_allclose(res1.model.exog, resf.model.exog, rtol=1e-10)++    def test_unordered(self):+        # makes sure that ordered = True is optional for the endog Serie+        # et categories have to be set in the right order+        res1 = self.res1+        resf = self.resu+        # converges slightly differently why?+        assert_allclose(res1.params, resf.params, atol=1e-10)+        assert_allclose(res1.bse, resf.bse, atol=1e-10)++        assert_allclose(res1.model.endog, resf.model.endog, rtol=1e-10)+        assert_allclose(res1.model.exog, resf.model.exog, rtol=1e-10)++    def test_results_other(self):++        res1 = self.res1  # numpy+        resp = self.resp  # pandas++        param_names_np = ['x1', 'x2', 'x3', '0/1', '1/2']+        param_names_pd = ['pared', 'public', 'gpa', 'unlikely/somewhat likely',+                          'somewhat likely/very likely']++        assert res1.model.data.param_names == param_names_np+        assert self.resp.model.data.param_names == param_names_pd+        assert self.resp.model.endog_names == "apply"++        # results+        if hasattr(self, "pred_table"):+            table = res1.pred_table()+            assert_equal(table.values, self.pred_table)++        # smoke test+        res1.summary()++        # inherited+        tt = res1.t_test(np.eye(len(res1.params)))+        assert_allclose(tt.pvalue, res1.pvalues, rtol=1e-13)++        tt = resp.t_test(['pared', 'public', 'gpa'])  # pandas names+        assert_allclose(tt.pvalue, res1.pvalues[:3], rtol=1e-13)++        pred = res1.predict(exog=res1.model.exog[-5:])+        fitted = 
res1.predict()+        assert_allclose(pred, fitted[-5:], rtol=1e-13)++        pred = resp.predict(exog=resp.model.data.orig_exog.iloc[-5:])+        fitted = resp.predict()+        assert_allclose(pred, fitted[-5:], rtol=1e-13)++        dataf = self.resf.model.data.frame  # is a dict+        dataf_df = pd.DataFrame.from_dict(dataf)+        pred = self.resf.predict(exog=dataf_df.iloc[-5:])+        fitted = self.resf.predict()+        assert_allclose(pred, fitted[-5:], rtol=1e-13)+++class TestLogitModel(CheckOrdinalModelMixin):++    @classmethod+    def setup_class(cls):+        data = ds.df+        data_unordered = ds.df_unordered++        # standard fit+        mod = OrderedModel(data['apply'].values.codes,+                           np.asarray(data[['pared', 'public', 'gpa']], float),+                           distr='logit')+        res = mod.fit(method='bfgs', disp=False)+        # standard fit with pandas input+        modp = OrderedModel(data['apply'],+                            data[['pared', 'public', 'gpa']],+                            distr='logit')+        resp = modp.fit(method='bfgs', disp=False)+        # fit with formula+        with warnings.catch_warnings():+            warnings.simplefilter("ignore", SpecificationWarning)+            modf = OrderedModel.from_formula(+                "apply ~ pared + public + gpa - 1",+                data={"apply": data['apply'].values.codes,+                      "pared": data['pared'],+                      "public": data['public'],+                      "gpa": data['gpa']},+                distr='logit')+        resf = modf.fit(method='bfgs', disp=False)+        # fit on data with ordered=False+        modu = OrderedModel(+            data_unordered['apply'].values.codes,+            np.asarray(data_unordered[['pared', 'public', 'gpa']], float),+            distr='logit')+        resu = modu.fit(method='bfgs', disp=False)++        from .results.results_ordinal_model import res_ord_logit as res2+        
cls.res2 = res2+        cls.res1 = res+        cls.resp = resp+        cls.resf = resf+        cls.resu = resu+++class TestProbitModel(CheckOrdinalModelMixin):++    @classmethod+    def setup_class(cls):+        data = ds.df+        data_unordered = ds.df_unordered++        mod = OrderedModel(data['apply'].values.codes,+                           np.asarray(data[['pared', 'public', 'gpa']], float),+                           distr='probit')+        res = mod.fit(method='bfgs', disp=False)++        modp = OrderedModel(data['apply'],+                            data[['pared', 'public', 'gpa']],+                            distr='probit')+        resp = modp.fit(method='bfgs', disp=False)++        with warnings.catch_warnings():+            warnings.simplefilter("ignore", SpecificationWarning)+            modf = OrderedModel.from_formula(+                "apply ~ pared + public + gpa - 1",+                data={"apply": data['apply'].values.codes,+                      "pared": data['pared'],+                      "public": data['public'],+                      "gpa": data['gpa']},+                distr='probit')+        resf = modf.fit(method='bfgs', disp=False)++        modu = OrderedModel(+            data_unordered['apply'].values.codes,+            np.asarray(data_unordered[['pared', 'public', 'gpa']], float),+            distr='probit')+        resu = modu.fit(method='bfgs', disp=False)++        from .results.results_ordinal_model import res_ord_probit as res2+        cls.res2 = res2+        cls.res1 = res+        cls.resp = resp+        cls.resf = resf+        cls.resu = resu++        # regression numbers+        cls.pred_table = np.array([[202,  18,   0, 220],+                                   [112,  28,   0, 140],+                                   [ 27,  13,   0,  40],  # noqa+                                   [341,  59,   0, 400]], dtype=np.int64)++    def test_loglikerelated(self):++        res1 = self.res1+        # res2 = self.res2++        mod = 
res1.model+        fact = 1.1  # evaluate away from optimum+        score1 = mod.score(res1.params * fact)+        score_obs_numdiff = mod.score_obs(res1.params * fact)+        score_obs_exog = mod.score_obs_(res1.params * fact)+        assert_allclose(score_obs_numdiff.sum(0), score1, atol=1e-7)+        assert_allclose(score_obs_exog.sum(0), score1[:mod.k_vars], atol=1e-7)++        # null model+        mod_null = OrderedModel(mod.endog, None,+                                offset=np.zeros(mod.nobs),+                                distr=mod.distr)+        null_params = mod.start_params+        res_null = mod_null.fit(method='bfgs', disp=False)+        assert_allclose(res_null.params, null_params[mod.k_vars:], rtol=1e-8)++    def test_formula_categorical(self):++        resp = self.resp+        data = ds.df++        with warnings.catch_warnings():+            warnings.simplefilter("ignore", SpecificationWarning)+            formula = "apply ~ pared + public + gpa - 1"+            modf2 = OrderedModel.from_formula(formula,+                                              data, distr='probit')+        resf2 = modf2.fit(method='bfgs', disp=False)+        assert_allclose(resf2.params, resp.params, atol=1e-8)+        assert modf2.exog_names == resp.model.exog_names+        assert modf2.data.ynames == resp.model.data.ynames+        assert hasattr(modf2.data, "frame")+        assert not hasattr(modf2, "frame")++        with pytest.raises(ValueError):+            with warnings.catch_warnings():+                warnings.simplefilter("ignore", SpecificationWarning)+                OrderedModel.from_formula(+                    "apply ~ pared + public + gpa - 1",+                    data={"apply": np.asarray(data['apply']),+                          "pared": data['pared'],+                          "public": data['public'],+                          "gpa": data['gpa']},+                    distr='probit')+++class TestLogitModelFormula():++    @classmethod+    def 
setup_class(cls):+        warnings.simplefilter("ignore", SpecificationWarning)+        data = ds.df+        nobs = len(data)+        data["dummy"] = (np.arange(nobs) < (nobs / 2)).astype(float)+        # alias to correspond to patsy name+        data["C(dummy)[T.1.0]"] = data["dummy"]+        cls.data = data++        columns = ['C(dummy)[T.1.0]', 'pared', 'public', 'gpa']+        # standard fit+        mod = OrderedModel(data['apply'].values.codes,+                           np.asarray(data[columns], float),+                           distr='logit')+        cls.res = mod.fit(method='bfgs', disp=False)+        # standard fit with pandas input+        modp = OrderedModel(data['apply'],+                            data[columns],+                            distr='logit')+        cls.resp = modp.fit(method='bfgs', disp=False)++    def test_setup(self):+        data = self.data+        resp = self.resp+        fittedvalues = resp.predict()++        formulas = ["apply ~ 1 + pared + public + gpa + C(dummy)",+                    "apply ~ pared + public + gpa + C(dummy)"]+        for formula in formulas:+            modf1 = OrderedModel.from_formula(formula, data, distr='logit')+            resf1 = modf1.fit(method='bfgs')+            summf1 = resf1.summary()+            summf1_str = str(summf1)+            assert resf1.model.exog_names == resp.model.exog_names+            assert resf1.model.data.param_names == resp.model.exog_names+            assert all(name in summf1_str for name in+                       resp.model.data.param_names)+            assert_allclose(resf1.predict(data[:5]), fittedvalues[:5])++        # test over parameterized model with implicit constant+        # warns but doesn't raise+        formula = "apply ~ 0 + pared + public + gpa + C(dummy)"++        with pytest.warns(SpecificationWarning):+            modf2 = OrderedModel.from_formula(formula, data, distr='logit')++        with pytest.warns(HessianInversionWarning):+            resf2 = 
modf2.fit(method='bfgs')+            assert np.isnan(resf2.bse).all()`

I'm not sure whether we get the HessianInversionWarning either; the current assert fails before that. The optimization might stop at a parameter value that still has an invertible Hessian.

josef-pkt

comment created time in 20 hours

PullRequestReviewEvent

`+"""+Test  for ordinal models+"""++import numpy as np+import scipy.stats as stats+import pytest+import pandas as pd++from numpy.testing import assert_allclose, assert_equal+from statsmodels.tools.sm_exceptions import (+    HessianInversionWarning, SpecificationWarning)+from .results.results_ordinal_model import data_store as ds+from statsmodels.miscmodels.ordinal_model import OrderedModel++import warnings+warnings.simplefilter("ignore", SpecificationWarning)+++class CheckOrdinalModelMixin(object):++    def test_basic(self):+        # checks basic results againt R MASS package+        n_cat = ds.n_ordinal_cat+        res1 = self.res1+        res2 = self.res2+        # coefficients values, standard errors, t & p values+        assert_allclose(res1.params[:-n_cat + 1],+                        res2.coefficients_val, atol=2e-4)+        assert_allclose(res1.bse[:-n_cat + 1],+                        res2.coefficients_stdE, rtol=0.003, atol=1e-5)+        assert_allclose(res1.tvalues[:-n_cat + 1],+                        res2.coefficients_tval, rtol=0.003, atol=7e-4)+        assert_allclose(res1.pvalues[:-n_cat + 1],+                        res2.coefficients_pval, rtol=0.009, atol=1e-5)+        # thresholds are given with exponentiated increments+        # from the first threshold+        assert_allclose(+            res1.model.transform_threshold_params(res1.params)[1:-1],+            res2.thresholds, atol=4e-4)++        # probabilities+        assert_allclose(res1.predict()[:7, :],+                        res2.prob_pred, atol=5e-5)++    def test_pandas(self):+        # makes sure that the Pandas ecosystem is supported+        res1 = self.res1+        resp = self.resp+        # converges slightly differently why?+        assert_allclose(res1.params, resp.params, atol=1e-10)+        assert_allclose(res1.bse, resp.bse, atol=1e-10)++        assert_allclose(res1.model.endog, resp.model.endog, rtol=1e-10)+        assert_allclose(res1.model.exog, resp.model.exog, rtol=1e-10)++  
  def test_formula(self):+        # makes sure the "R-way" of writing models is supported+        res1 = self.res1+        resf = self.resf+        # converges slightly differently why? yet e-5 is ok+        assert_allclose(res1.params, resf.params, atol=5e-5)+        assert_allclose(res1.bse, resf.bse, atol=5e-5)++        assert_allclose(res1.model.endog, resf.model.endog, rtol=1e-10)+        assert_allclose(res1.model.exog, resf.model.exog, rtol=1e-10)++    def test_unordered(self):+        # makes sure that ordered = True is optional for the endog Serie+        # et categories have to be set in the right order+        res1 = self.res1+        resf = self.resu+        # converges slightly differently why?+        assert_allclose(res1.params, resf.params, atol=1e-10)+        assert_allclose(res1.bse, resf.bse, atol=1e-10)++        assert_allclose(res1.model.endog, resf.model.endog, rtol=1e-10)+        assert_allclose(res1.model.exog, resf.model.exog, rtol=1e-10)++    def test_results_other(self):++        res1 = self.res1  # numpy+        resp = self.resp  # pandas++        param_names_np = ['x1', 'x2', 'x3', '0/1', '1/2']+        param_names_pd = ['pared', 'public', 'gpa', 'unlikely/somewhat likely',+                          'somewhat likely/very likely']++        assert res1.model.data.param_names == param_names_np+        assert self.resp.model.data.param_names == param_names_pd+        assert self.resp.model.endog_names == "apply"++        # results+        if hasattr(self, "pred_table"):+            table = res1.pred_table()+            assert_equal(table.values, self.pred_table)++        # smoke test+        res1.summary()++        # inherited+        tt = res1.t_test(np.eye(len(res1.params)))+        assert_allclose(tt.pvalue, res1.pvalues, rtol=1e-13)++        tt = resp.t_test(['pared', 'public', 'gpa'])  # pandas names+        assert_allclose(tt.pvalue, res1.pvalues[:3], rtol=1e-13)++        pred = res1.predict(exog=res1.model.exog[-5:])+        fitted = 
res1.predict()+        assert_allclose(pred, fitted[-5:], rtol=1e-13)++        pred = resp.predict(exog=resp.model.data.orig_exog.iloc[-5:])+        fitted = resp.predict()+        assert_allclose(pred, fitted[-5:], rtol=1e-13)++        dataf = self.resf.model.data.frame  # is a dict+        dataf_df = pd.DataFrame.from_dict(dataf)+        pred = self.resf.predict(exog=dataf_df.iloc[-5:])+        fitted = self.resf.predict()+        assert_allclose(pred, fitted[-5:], rtol=1e-13)+++class TestLogitModel(CheckOrdinalModelMixin):++    @classmethod+    def setup_class(cls):+        data = ds.df+        data_unordered = ds.df_unordered++        # standard fit+        mod = OrderedModel(data['apply'].values.codes,+                           np.asarray(data[['pared', 'public', 'gpa']], float),+                           distr='logit')+        res = mod.fit(method='bfgs', disp=False)+        # standard fit with pandas input+        modp = OrderedModel(data['apply'],+                            data[['pared', 'public', 'gpa']],+                            distr='logit')+        resp = modp.fit(method='bfgs', disp=False)+        # fit with formula+        with warnings.catch_warnings():+            warnings.simplefilter("ignore", SpecificationWarning)+            modf = OrderedModel.from_formula(+                "apply ~ pared + public + gpa - 1",+                data={"apply": data['apply'].values.codes,+                      "pared": data['pared'],+                      "public": data['public'],+                      "gpa": data['gpa']},+                distr='logit')+        resf = modf.fit(method='bfgs', disp=False)+        # fit on data with ordered=False+        modu = OrderedModel(+            data_unordered['apply'].values.codes,+            np.asarray(data_unordered[['pared', 'public', 'gpa']], float),+            distr='logit')+        resu = modu.fit(method='bfgs', disp=False)++        from .results.results_ordinal_model import res_ord_logit as res2+        
cls.res2 = res2+        cls.res1 = res+        cls.resp = resp+        cls.resf = resf+        cls.resu = resu+++class TestProbitModel(CheckOrdinalModelMixin):++    @classmethod+    def setup_class(cls):+        data = ds.df+        data_unordered = ds.df_unordered++        mod = OrderedModel(data['apply'].values.codes,+                           np.asarray(data[['pared', 'public', 'gpa']], float),+                           distr='probit')+        res = mod.fit(method='bfgs', disp=False)++        modp = OrderedModel(data['apply'],+                            data[['pared', 'public', 'gpa']],+                            distr='probit')+        resp = modp.fit(method='bfgs', disp=False)++        with warnings.catch_warnings():+            warnings.simplefilter("ignore", SpecificationWarning)+            modf = OrderedModel.from_formula(+                "apply ~ pared + public + gpa - 1",+                data={"apply": data['apply'].values.codes,+                      "pared": data['pared'],+                      "public": data['public'],+                      "gpa": data['gpa']},+                distr='probit')+        resf = modf.fit(method='bfgs', disp=False)++        modu = OrderedModel(+            data_unordered['apply'].values.codes,+            np.asarray(data_unordered[['pared', 'public', 'gpa']], float),+            distr='probit')+        resu = modu.fit(method='bfgs', disp=False)++        from .results.results_ordinal_model import res_ord_probit as res2+        cls.res2 = res2+        cls.res1 = res+        cls.resp = resp+        cls.resf = resf+        cls.resu = resu++        # regression numbers+        cls.pred_table = np.array([[202,  18,   0, 220],+                                   [112,  28,   0, 140],+                                   [ 27,  13,   0,  40],  # noqa+                                   [341,  59,   0, 400]], dtype=np.int64)++    def test_loglikerelated(self):++        res1 = self.res1+        # res2 = self.res2++        mod = 
res1.model+        fact = 1.1  # evaluate away from optimum+        score1 = mod.score(res1.params * fact)+        score_obs_numdiff = mod.score_obs(res1.params * fact)+        score_obs_exog = mod.score_obs_(res1.params * fact)+        assert_allclose(score_obs_numdiff.sum(0), score1, atol=1e-7)+        assert_allclose(score_obs_exog.sum(0), score1[:mod.k_vars], atol=1e-7)++        # null model+        mod_null = OrderedModel(mod.endog, None,+                                offset=np.zeros(mod.nobs),+                                distr=mod.distr)+        null_params = mod.start_params+        res_null = mod_null.fit(method='bfgs', disp=False)+        assert_allclose(res_null.params, null_params[mod.k_vars:], rtol=1e-8)++    def test_formula_categorical(self):++        resp = self.resp+        data = ds.df++        with warnings.catch_warnings():+            warnings.simplefilter("ignore", SpecificationWarning)+            formula = "apply ~ pared + public + gpa - 1"+            modf2 = OrderedModel.from_formula(formula,+                                              data, distr='probit')+        resf2 = modf2.fit(method='bfgs', disp=False)+        assert_allclose(resf2.params, resp.params, atol=1e-8)+        assert modf2.exog_names == resp.model.exog_names+        assert modf2.data.ynames == resp.model.data.ynames+        assert hasattr(modf2.data, "frame")+        assert not hasattr(modf2, "frame")++        with pytest.raises(ValueError):+            with warnings.catch_warnings():+                warnings.simplefilter("ignore", SpecificationWarning)+                OrderedModel.from_formula(+                    "apply ~ pared + public + gpa - 1",+                    data={"apply": np.asarray(data['apply']),+                          "pared": data['pared'],+                          "public": data['public'],+                          "gpa": data['gpa']},+                    distr='probit')+++class TestLogitModelFormula():++    @classmethod+    def 
setup_class(cls):+        warnings.simplefilter("ignore", SpecificationWarning)+        data = ds.df+        nobs = len(data)+        data["dummy"] = (np.arange(nobs) < (nobs / 2)).astype(float)+        # alias to correspond to patsy name+        data["C(dummy)[T.1.0]"] = data["dummy"]+        cls.data = data++        columns = ['C(dummy)[T.1.0]', 'pared', 'public', 'gpa']+        # standard fit+        mod = OrderedModel(data['apply'].values.codes,+                           np.asarray(data[columns], float),+                           distr='logit')+        cls.res = mod.fit(method='bfgs', disp=False)+        # standard fit with pandas input+        modp = OrderedModel(data['apply'],+                            data[columns],+                            distr='logit')+        cls.resp = modp.fit(method='bfgs', disp=False)++    def test_setup(self):+        data = self.data+        resp = self.resp+        fittedvalues = resp.predict()++        formulas = ["apply ~ 1 + pared + public + gpa + C(dummy)",+                    "apply ~ pared + public + gpa + C(dummy)"]+        for formula in formulas:+            modf1 = OrderedModel.from_formula(formula, data, distr='logit')+            resf1 = modf1.fit(method='bfgs')+            summf1 = resf1.summary()+            summf1_str = str(summf1)+            assert resf1.model.exog_names == resp.model.exog_names+            assert resf1.model.data.param_names == resp.model.exog_names+            assert all(name in summf1_str for name in+                       resp.model.data.param_names)+            assert_allclose(resf1.predict(data[:5]), fittedvalues[:5])++        # test over parameterized model with implicit constant+        # warns but doesn't raise+        formula = "apply ~ 0 + pared + public + gpa + C(dummy)"++        with pytest.warns(SpecificationWarning):+            modf2 = OrderedModel.from_formula(formula, data, distr='logit')++        with pytest.warns(HessianInversionWarning):+            resf2 = 
modf2.fit(method='bfgs')+            assert np.isnan(resf2.bse).all()`

This assert fails on several, but not all, Windows environments; in those cases the standard errors have finite values.

I have nans on my computer in this case, but I had other cases with finite values, although huge bse for unidentified parameters

josef-pkt

comment created time in 20 hours

PullRequestReviewEvent

duplicate of #4282

but we leave it open to fix this soon

brl0

comment created time in a day

push eventjosef-pkt/statsmodels

commit sha 3f377e43e660e7d52e9ae80aab9b3d75e33d5847

REF: use SpecificationWarning for formula,

push time in a day

issue openedstatsmodels/statsmodels

I converted UserWarnings to SpecificationWarning, and unit tests are now failing with the warning

the warning filter is set to "error"

I guess the setting is changed in setup.cfg

``````    error::statsmodels.tools.sm_exceptions.HypothesisTestWarning
error::statsmodels.tools.sm_exceptions.SpecificationWarning
``````

created time in a day

push eventjosef-pkt/statsmodels

commit sha db637720f06b99c3f21721843a942cd90bda3be6

ENH: drop Intercept in `from_formula`, similar to MixedLM, fix predict design_info

push time in a day

push eventstatsmodels/statsmodels.github.io

commit sha c615506141c01c797f7d5ed3641fee997089ddb9

Update docs after building Travis build 12379 of statsmodels/statsmodels The docs were built from the branch 'master' against the commit df1c1a1bd17bfbab7c61bd903f06362a7bb5dc0b. The Travis build that generated this commit is at https://travis-ci.org/statsmodels/statsmodels/jobs/728524060. The doctr command that was run is /home/travis/miniconda/envs/statsmodels-test/bin/doctr deploy --built-docs docs/build/html/ --deploy-repo statsmodels/statsmodels.github.io devel

push time in 2 days

push eventjosef-pkt/statsmodels

commit sha 00933bfba0a58406009bb73015e38cfd273627ce

ENH: drop Intercept in `from_formula`, similar to MixedLM

push time in 2 days

another strange thing (after fixing design_info in predict)

`predict()`, without exog, returns numpy array and not DataFrame or Series. Is this intentional? maybe for internal usage, e.g. to get fittedvalues, resid, ...? Returning pandas in `Results.predict` is not in the wrapper, so we also get internally the same as the user

``````resf_logit.predict()
array([[0.54884071, 0.35932276, 0.09183653],
[0.30558191, 0.47594216, 0.21847593],
[0.22938356, 0.47819057, 0.29242587],
...,
[0.69380357, 0.25470075, 0.05149568],
[0.54884071, 0.35932276, 0.09183653],
[0.50896794, 0.38494062, 0.10609145]])

print(resf_logit.predict(data_student.iloc[:5]))
0         1         2
0  0.548841  0.359323  0.091837
1  0.305582  0.475942  0.218476
2  0.229384  0.478191  0.292426
3  0.616118  0.312690  0.071191
4  0.656003  0.283398  0.060599
``````
josef-pkt

comment created time in 2 days

Note that `getattr(resf_logit.model, "design_info", resf_logit.model.data.design_info)` raises if `data.design_info` doesn't exist: Python evaluates the default-argument expression eagerly, before the `getattr` call, even when the first attribute lookup would succeed.

josef-pkt

comment created time in 2 days

issue openedstatsmodels/statsmodels

https://github.com/statsmodels/statsmodels/pull/7035#issuecomment-694970684

needs to be investigated.

removing intercept attaches the reduced design_info to the model but not `model.data`. `base.Results.predict` uses `model.data.design_info`

Can we have cases where model.design_info and model.data.design_info differ?

I will be changing `base.Results.predict` to allow both, but there should be a clear rule `design_info = getattr(self.model, "design_info", self.model.data.design_info)`

created time in 2 days

more bugs, predict doesn't work with formula, missing `design_info`

``````resf_logit.predict()
array([[0.54884071, 0.35932276, 0.09183653],
[0.30558191, 0.47594216, 0.21847593],
[0.22938356, 0.47819057, 0.29242587],
...,
[0.69380357, 0.25470075, 0.05149568],
[0.54884071, 0.35932276, 0.09183653],
[0.50896794, 0.38494062, 0.10609145]])
resf_logit.predict(data_student.iloc[:5])
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-28-e85ceacdab72> in <module>
----> 1 resf_logit.predict(data_student.iloc[:5])

m:\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\base\model.py in predict(self, exog, transform, *args, **kwargs)
1061
1062         if transform and hasattr(self.model, 'formula') and (exog is not None):
-> 1063             design_info = self.model.data.design_info
1064             from patsy import dmatrix
1065             if isinstance(exog, pd.Series):

AttributeError: 'PandasData' object has no attribute 'design_info'
``````
josef-pkt

comment created time in 2 days

warning when it is not necessary

If there is no categorical variable in the model, then "0 +" would remove the intercept already correctly.

Not sure whether and how to fix this. At the location where the warning is issued we don't know yet whether there are categorical variables. It might also apply to splines (I haven't checked yet). A possible change is to reword the warning message to mention that it applies to the case with categorical variables.

``````OrderedModel.from_formula("apply_codes ~ 0 + pared + public + gpa", data_student,
distr='logit').fit().summary()
m:\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\miscmodels\ordinal_model.py:181: UserWarning: OrderedModel formulas should not include any '0' or '1' terms
warnings.warn(msg)
Optimization terminated successfully.
Current function value: 0.896281
Iterations: 421
Function evaluations: 663
``````
josef-pkt

comment created time in 2 days

issue openedstatsmodels/statsmodels

Some models cannot include a constant among regressors, e.g. PHReg or OrderedModel in the current parameterization.

Patsy always includes an explicit or implicit constant in exog when categorical variables are included. We have a workaround to drop an explicit constant and get the reduced design_info.

https://github.com/statsmodels/statsmodels/pull/3095 https://github.com/statsmodels/statsmodels/pull/7035#issuecomment-694452070 starting comment for adding support for dropping explicit intercept in a new model

created time in 2 days

With categorical interactions that have empty cells, patsy leaves a column of zeros in the exog.

sergiolevin

comment created time in 2 days

Kerby had figured out how to get the `design_info` after dropping a term or terms

https://github.com/statsmodels/statsmodels/pull/3095/files#diff-b165b4bd4e10edd0dfb485e47c562b2cR157 `design_info = design_info.builder.subset(cols).design_info`

josef-pkt

comment created time in 2 days

josef-pkt

comment created time in 3 days

possible workaround

the following works, but we still have wrong design_info. we would need to remove first term

``````modfd_logit = OrderedModel.from_formula("apply ~ 1 + pared + public + gpa + C(dummy)", data_student,
distr='logit')
modfd_logit.exog = modfd_logit.exog[:, 1:]
modfd_logit.k_vars -= 1
del modfd_logit.data.xnames[0]
resfd_logit = modfd_logit.fit(method='bfgs')
print(resfd_logit.summary())
``````
josef-pkt

comment created time in 3 days

no constant with patsy doesn't work if there is a categorical, i.e. we get both dummies

``````nobs = len(data_student)
data_student["dummy"] = (np.arange(nobs) < (nobs / 2)).astype(float)

modfd_logit = OrderedModel.from_formula("apply ~ 0 + pared + public + gpa + C(dummy)", data_student,
distr='logit')
resfd_logit = modfd_logit.fit(method='bfgs')
print(resfd_logit.summary())
Optimization terminated successfully.
Current function value: 0.896247
Iterations: 24
Function evaluations: 26
OrderedModel Results
==============================================================================
Dep. Variable:                  apply   Log-Likelihood:                -358.50
Model:                   OrderedModel   AIC:                             727.0
Method:            Maximum Likelihood   BIC:                             747.0
Date:                Thu, 17 Sep 2020
Time:                        15:27:59
No. Observations:                 400
Df Residuals:                     395
Df Model:                           4
===============================================================================================
coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
C(dummy)[0.0]                  -0.6834        nan        nan        nan         nan         nan
C(dummy)[1.0]                  -0.6508        nan        nan        nan         nan         nan
pared                           1.0489        nan        nan        nan         nan         nan
public                         -0.0588        nan        nan        nan         nan         nan
gpa                             0.6153        nan        nan        nan         nan         nan
unlikely/somewhat likely        1.5349        nan        nan        nan         nan         nan
somewhat likely/very likely     0.7398        nan        nan        nan         nan         nan
===============================================================================================
...\statsmodels\statsmodels\base\model.py:548: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available
'available', HessianInversionWarning)
``````
josef-pkt

comment created time in 3 days

`+# -*- coding: utf-8 -*-+"""+Created on Sat Aug 22 20:24:42 2015++Author: Josef Perktold+License: BSD-3+"""++import numpy as np+import pandas as pd+from pandas.api.types import CategoricalDtype+from scipy import stats+from statsmodels.base.model import (+    GenericLikelihoodModel, GenericLikelihoodModelResults)+from statsmodels.compat.pandas import Appender+++class OrderedModel(GenericLikelihoodModel):+    """Ordinal Model based on logistic or normal distribution++    The parameterization corresponds to the proportional odds model.++    The mode assumes that the endogenous variable is ordered but that the+    labels have no numeric interpretation besides the ordering.++    The model is based on a latent linear variable, where we observe only a+    discretization.++    y_latent = X beta + u++    The observed variable is defined by the interval++    y = {0 if y_latent <= cut_0+         1 of cut_0 < y_latent <= cut_1+         ...+         K if cut_K < y_latent++    The probability of observing y=k conditional on the explanatory variables+    X is given by++    prob(y = k | x) = Prob(cut_k < y_latent <= cut_k+1)+                    = Prob(cut_k - x beta < u <= cut_k+1 - x beta+                    = F(cut_k+1 - x beta) - F(cut_k - x beta)++    Where F is the cumulative distribution of u which is either the normal+    or the logistic distribution, but can be set to any other continuous+    distribution. 
We use standardized distributions to avoid identifiability+    problems.++    Parameters+    ----------+    endog : array_like+        Endogenous or dependent ordered categorical variable with k levels.+        Labels or values of endog will internally transformed to consecutive+        integers, 0, 1, 2, ...+        pd.Series with Categorical as dtype should be preferred as it gives+        the order relation between the levels.+        If endog is not a pandas Categorical, then categories are+        sorted in lexicographic order (by numpy.unique).+    exog : array_like+        Exogenous, explanatory variables. This should not include an intercept.+        pd.DataFrame are also accepted.+    distr : string 'probit' or 'logit', or a distribution instance+        The default is currently 'probit' which uses the normal distribution+        and corresponds to an ordered Probit model. The distribution is+        assumed to have the main methods of scipy.stats distributions, mainly+        cdf, pdf and ppf. 
The inverse cdf, ppf, is only use to calculate+        starting values.++    Status: initial version, subclasses `GenericLikelihoodModel`++    """+    _formula_max_endog = np.inf++    def __init__(self, endog, exog, offset=None, distr='probit', **kwds):++        if distr == 'probit':+            self.distr = stats.norm+        elif distr == 'logit':+            self.distr = stats.logistic+        else:+            self.distr = distr++        if offset is not None:+            offset = np.asarray(offset)++        self.offset = offset++        endog, labels, is_pandas = self._check_inputs(endog, exog)++        super(OrderedModel, self).__init__(endog, exog, **kwds)++        if not is_pandas:+            if self.endog.ndim == 1:+                unique, index = np.unique(self.endog, return_inverse=True)+                self.endog = index+                labels = unique+            elif self.endog.ndim == 2:+                if not hasattr(self, "design_info"):+                    raise ValueError("2-dim endog not supported")+                # this branch is currently only in support of from_formula+                # labels here are only needed to choose k_levels in initialize+                labels = [str(i) for i in range(self.endog.shape[1])]+                labels = []+                # Note: Doing the following here would break from_formula+                # self.endog = self.endog.argmax(1)++        self._initialize_labels(labels)++        self.results_class = OrderedResults++    def _check_inputs(self, endog, exog):+        """handle endog that is pandas Categorical++        checks if self.distrib is legal and does the Pandas Categorical+        support for endog.+        """++        if not isinstance(self.distr, stats.rv_continuous):+            import warnings+            msg = (+                f"{self.distr.name} is not a scipy.stats distribution."+            )+            raise warnings.warn(msg)++        labels = None+        is_pandas = False+        if 
isinstance(endog, pd.Series):+            if isinstance(endog.dtypes, CategoricalDtype):+                if not endog.dtype.ordered:+                    warnings.warn("the endog has ordered == False, "+                                  "risk of capturing a wrong order for the "+                                  "categories. ordered == True preferred.",+                                  Warning)++                endog_name = endog.name+                labels = endog.values.categories+                endog = endog.cat.codes+                if endog.min() == -1:  # means there is a missing value+                    raise ValueError("missing values in categorical endog are "+                                     "not supported")+                endog.name = endog_name+                is_pandas = True++        return endog, labels, is_pandas++    def _initialize_labels(self, labels):+        self.labels = labels+        self.k_levels = len(labels)++        if self.exog is not None:+            self.nobs, self.k_vars = self.exog.shape+        else:  # no exog in model+            self.nobs, self.k_vars = self.endog.shape[0], 0++        threshold_names = [str(x) + '/' + str(y)+                           for x, y in zip(labels[:-1], labels[1:])]++        # from GenericLikelihoodModel.fit+        if self.exog is not None:+            # avoid extending several times+            if len(self.exog_names) > self.k_vars:+                raise RuntimeError("something wrong with exog_names, too long")+            self.exog_names.extend(threshold_names)+        else:+            self.data.xnames = threshold_names++    @classmethod+    def from_formula(cls, formula, data, subset=None, drop_cols=None,+                     *args, **kwargs):++        if "0+" not in formula.replace(" ", ""):+            import warnings+            warnings.warn("Ordinal models should not include an intercept")++        endog_name = formula.split("~")[0].strip()+        original_endog = data[endog_name]++  
      model = super(OrderedModel, cls).from_formula(+            formula, data=data, *args, **kwargs)++        if model.endog.ndim == 2:+            if not (isinstance(original_endog.dtype, CategoricalDtype)+                    and original_endog.dtype.ordered):+                msg = ("Only ordered pandas Categorical are supported as "+                       "endog in formulas")+                raise ValueError(msg)++            labels = original_endog.values.categories+            model._initialize_labels(labels)+            model.endog = model.endog.argmax(1)+            model.data.ynames = endog_name++        return model++    def cdf(self, x):+        """cdf evaluated at x+        """+        return self.distr.cdf(x)++    def pdf(self, x):+        """pdf evaluated at x+        """+        return self.distr.pdf(x)++    def prob(self, low, upp):+        """interval probability+        """+        return np.maximum(self.cdf(upp) - self.cdf(low), 0)++    def transform_threshold_params(self, params):+        """transformation of the parameters in the optimization++        Parameters+        ----------+        params : nd_array+            contains (exog_coef, transformed_thresholds) where exog_coef are+            the coefficient for the explanatory variables in the linear term,+            transformed threshold or cutoff points. 
The first, lowest threshold+            is unchanged, all other thresholds are in terms of exponentiated+            increments++        Returns+        -------+        thresh : nd_array+            thresh are the thresholds or cutoff constants for the intervals.++        """+        th_params = params[-(self.k_levels - 1):]+        thresh = np.concatenate((th_params[:1],+                                 np.exp(th_params[1:]))).cumsum()+        thresh = np.concatenate(([-np.inf], thresh, [np.inf]))+        return thresh++    def transform_reverse_threshold_params(self, params):+        """obtain transformed thresholds from original thresholds, cutoff+        constants.++        """+        start_ppf = params+        thresh_params = np.concatenate((start_ppf[:1],+                                        np.log(np.diff(start_ppf[:-1]))))+        return thresh_params++    def predict(self, params, exog=None):+        """predicted probabilities for each level of the ordinal endog.+++        """+        if exog is None:+            exog = self.exog+        # structure of params = [beta, constants_or_thresholds]++        # explicit in several steps to avoid bugs+        th_params = params[-(self.k_levels - 1):]+        thresh = np.concatenate((th_params[:1],+                                 np.exp(th_params[1:]))).cumsum()+        thresh = np.concatenate(([-np.inf], thresh, [np.inf]))+        xb = exog.dot(params[:-(self.k_levels - 1)])[:, None]+        low = thresh[:-1] - xb`

BUG: missing offset and it should delegate to _linpred

also option for `what` to return, e.g. linear, cdf, ...

josef-pkt

comment created time in 4 days

PullRequestReviewEvent

all green, I have all the minimum parts that I wanted.

• maybe change name to OrdinalModel instead of OrderedModel.
• currently remains as subclasses of genericlikelihood
• post-estimation, diagnostics: I haven't looked much yet for those
• extensions,
• with offset implemented, it would be relatively easy to add `fit_constrained`
• I have not yet looked at `margins` for it.

still ambiguous: how can we ascertain that we have no constant when using formulas? i.e. get correct patsy behavior

I'm still not sure which parameterization of constant versus thresholds would be better. Greene argues that if the parameterization includes a constant, then the model reduces to standard Logit or Probit in the binary case.

unit test for binary case are missing.

josef-pkt

comment created time in 4 days

`+# -*- coding: utf-8 -*-+"""+Created on Sat Aug 22 20:24:42 2015++Author: Josef Perktold+License: BSD-3+"""++import numpy as np+import pandas as pd+from pandas.api.types import CategoricalDtype+from scipy import stats+from statsmodels.base.model import (+    GenericLikelihoodModel, GenericLikelihoodModelResults)+from statsmodels.compat.pandas import Appender+++class OrderedModel(GenericLikelihoodModel):+    """Ordinal Model based on logistic or normal distribution++    The parameterization corresponds to the proportional odds model.++    The mode assumes that the endogenous variable is ordered but that the+    labels have no numeric interpretation besides the ordering.++    The model is based on a latent linear variable, where we observe only a+    discretization.++    y_latent = X beta + u++    The observed variable is defined by the interval++    y = {0 if y_latent <= cut_0+         1 of cut_0 < y_latent <= cut_1+         ...+         K if cut_K < y_latent++    The probability of observing y=k conditional on the explanatory variables+    X is given by++    prob(y = k | x) = Prob(cut_k < y_latent <= cut_k+1)+                    = Prob(cut_k - x beta < u <= cut_k+1 - x beta+                    = F(cut_k+1 - x beta) - F(cut_k - x beta)++    Where F is the cumulative distribution of u which is either the normal+    or the logistic distribution, but can be set to any other continuous+    distribution. 
We use standardized distributions to avoid identifiability+    problems.++    Parameters+    ----------+    endog : array_like+        Endogenous or dependent ordered categorical variable with k levels.+        Labels or values of endog will internally transformed to consecutive+        integers, 0, 1, 2, ...+        pd.Series with Categorical as dtype should be preferred as it gives+        the order relation between the levels.+        If endog is not a pandas Categorical, then categories are+        sorted in lexicographic order (by numpy.unique).+    exog : array_like+        Exogenous, explanatory variables. This should not include an intercept.+        pd.DataFrame are also accepted.+    distr : string 'probit' or 'logit', or a distribution instance+        The default is currently 'probit' which uses the normal distribution+        and corresponds to an ordered Probit model. The distribution is+        assumed to have the main methods of scipy.stats distributions, mainly+        cdf, pdf and ppf. 
The inverse cdf, ppf, is only use to calculate+        starting values.++    Status: initial version, subclasses `GenericLikelihoodModel`++    """+    _formula_max_endog = np.inf++    def __init__(self, endog, exog, offset=None, distr='probit', **kwds):++        if distr == 'probit':+            self.distr = stats.norm+        elif distr == 'logit':+            self.distr = stats.logistic+        else:+            self.distr = distr++        if offset is not None:+            offset = np.asarray(offset)++        self.offset = offset++        endog, labels, is_pandas = self._check_inputs(endog, exog)++        super(OrderedModel, self).__init__(endog, exog, **kwds)++        if not is_pandas:+            if self.endog.ndim == 1:+                unique, index = np.unique(self.endog, return_inverse=True)+                self.endog = index+                labels = unique+            elif self.endog.ndim == 2:+                if not hasattr(self, "design_info"):+                    raise ValueError("2-dim endog not supported")+                # this branch is currently only in support of from_formula+                # labels here are only needed to choose k_levels in initialize+                labels = [str(i) for i in range(self.endog.shape[1])]+                labels = []+                # Note: Doing the following here would break from_formula+                # self.endog = self.endog.argmax(1)++        self._initialize_labels(labels)++        self.results_class = OrderedResults++    def _check_inputs(self, endog, exog):+        """handle endog that is pandas Categorical++        checks if self.distrib is legal and does the Pandas Categorical+        support for endog.+        """++        if not isinstance(self.distr, stats.rv_continuous):+            import warnings+            msg = (+                f"{self.distr.name} is not a scipy.stats distribution."+            )+            raise warnings.warn(msg)++        labels = None+        is_pandas = False+        if 
isinstance(endog, pd.Series):+            if isinstance(endog.dtypes, CategoricalDtype):+                if not endog.dtype.ordered:+                    warnings.warn("the endog has ordered == False, "+                                  "risk of capturing a wrong order for the "+                                  "categories. ordered == True preferred.",+                                  Warning)++                endog_name = endog.name+                labels = endog.values.categories+                endog = endog.cat.codes+                if endog.min() == -1:  # means there is a missing value+                    raise ValueError("missing values in categorical endog are "+                                     "not supported")+                endog.name = endog_name+                is_pandas = True++        return endog, labels, is_pandas++    def _initialize_labels(self, labels):+        self.labels = labels+        self.k_levels = len(labels)++        if self.exog is not None:+            self.nobs, self.k_vars = self.exog.shape+        else:  # no exog in model+            self.nobs, self.k_vars = self.endog.shape[0], 0++        threshold_names = [str(x) + '/' + str(y)+                           for x, y in zip(labels[:-1], labels[1:])]++        # from GenericLikelihoodModel.fit+        if self.exog is not None:+            # avoid extending several times+            if len(self.exog_names) > self.k_vars:+                raise RuntimeError("something wrong with exog_names, too long")+            self.exog_names.extend(threshold_names)+        else:+            self.data.xnames = threshold_names++    @classmethod+    def from_formula(cls, formula, data, subset=None, drop_cols=None,+                     *args, **kwargs):++        if "0+" not in formula.replace(" ", ""):+            import warnings+            warnings.warn("Ordinal models should not include an intercept")++        endog_name = formula.split("~")[0].strip()+        original_endog = data[endog_name]++  
      model = super(OrderedModel, cls).from_formula(+            formula, data=data, *args, **kwargs)++        if model.endog.ndim == 2:+            if not (isinstance(original_endog.dtype, CategoricalDtype)+                    and original_endog.dtype.ordered):+                msg = ("Only ordered pandas Categorical are supported as "+                       "endog in formulas")+                raise ValueError(msg)++            labels = original_endog.values.categories+            model._initialize_labels(labels)+            model.endog = model.endog.argmax(1)+            model.data.ynames = endog_name++        return model++    def cdf(self, x):+        """cdf evaluated at x+        """+        return self.distr.cdf(x)++    def pdf(self, x):+        """pdf evaluated at x+        """+        return self.distr.pdf(x)++    def prob(self, low, upp):+        """interval probability+        """+        return np.maximum(self.cdf(upp) - self.cdf(low), 0)++    def transform_threshold_params(self, params):+        """transformation of the parameters in the optimization++        Parameters+        ----------+        params : nd_array+            contains (exog_coef, transformed_thresholds) where exog_coef are+            the coefficient for the explanatory variables in the linear term,+            transformed threshold or cutoff points. 
The first, lowest threshold+            is unchanged, all other thresholds are in terms of exponentiated+            increments++        Returns+        -------+        thresh : nd_array+            thresh are the thresholds or cutoff constants for the intervals.++        """+        th_params = params[-(self.k_levels - 1):]+        thresh = np.concatenate((th_params[:1],+                                 np.exp(th_params[1:]))).cumsum()+        thresh = np.concatenate(([-np.inf], thresh, [np.inf]))+        return thresh++    def transform_reverse_threshold_params(self, params):+        """obtain transformed thresholds from original thresholds, cutoff+        constants.++        """+        start_ppf = params+        thresh_params = np.concatenate((start_ppf[:1],+                                        np.log(np.diff(start_ppf[:-1]))))+        return thresh_params++    def predict(self, params, exog=None):+        """predicted probabilities for each level of the ordinal endog.+++        """+        if exog is None:+            exog = self.exog+        # structure of params = [beta, constants_or_thresholds]++        # explicit in several steps to avoid bugs+        th_params = params[-(self.k_levels - 1):]+        thresh = np.concatenate((th_params[:1],+                                 np.exp(th_params[1:]))).cumsum()+        thresh = np.concatenate(([-np.inf], thresh, [np.inf]))+        xb = exog.dot(params[:-(self.k_levels - 1)])[:, None]+        low = thresh[:-1] - xb+        upp = thresh[1:] - xb+        prob = self.prob(low, upp)+        return prob++    def _linpred(self, params, exog=None, offset=None):+        """linear prediction of latent variable `x b`++        currently only for exog from estimation sample (in-sample)+        """+        if exog is None:+            exog = self.exog+        if offset is None:+            offset = self.offset`

This is incorrect if exog is not None but offset is None; that should raise an error about the shape mismatch between exog and self.offset.

No unit test coverage for the case when those are None.

josef-pkt

comment created time in 4 days

PullRequestReviewEvent

push eventjosef-pkt/statsmodels

commit sha c1664d989c4eea864180e69b200bef687494a234

CLN: cleanup, remove now redundant code and comments for formulas

push time in 4 days

that worked, only failures are with lint check.

The advantage is that I keep the information now inside `from_formula` the hackish part is that I need to fix up the created instance and change the model outside `__init__`. Also, I guess missing handling works, but I don't have unit tests for that. This could be refactored if we get #7033.

josef-pkt

comment created time in 5 days

push eventjosef-pkt/statsmodels

commit sha 0241fd52a74d68ef9a818c5ef475f7ae9f49c8fb

fix formula for pandas categorical

push time in 5 days

I continue with ordinal model in #7035 I leave this PR unchanged, so I can look at it again for adding `formula_tools`

josef-pkt

comment created time in 5 days

PR opened statsmodels/statsmodels

ENH: Ordinal model rebased2 comp-discrete type-enh

continuation of #7021 without adding `frame` as `Model.__init__` kwd

without last commit of #7021, and amended second to last commit to remove changes to `base.Model`

+1532 -0

0 comment

7 changed files

pr created time in 5 days

create branchjosef-pkt/statsmodels

created branch time in 5 days

push eventjosef-pkt/statsmodels

commit sha 7e64c9881c291e88338701e54d34b1969280abc2

fix `frame` as `Model.__init__` keyword

push time in 5 days

I'm giving up here, adding `frame` as keyword to `Model.__init__` affects too many parts. This needs more systematic refactoring #7033, and more unit tests for unpickling which might be broken in some cases #7034

needs checking where "frame" is used and what types of it are allowed

josef-pkt

comment created time in 5 days

issue openedstatsmodels/statsmodels

My guess is that we are missing unit tests for pickling of models and results from different fit_xxx, with or without formula.

maybe bugs if `model.data.frame` is None https://github.com/statsmodels/statsmodels/pull/7021#issuecomment-692750497

created time in 5 days

`frame` can be None, in constrained estimation (Poisson, GLM) and in conditional Poisson and in mediation https://travis-ci.org/github/statsmodels/statsmodels/jobs/727232933

`frame` is needed for unpickling with formulas, so there might be problems in those cases, unless it is an auxiliary model that doesn't need to be pickled (as in constrained estimation), but that's unlikely.

josef-pkt

comment created time in 5 days

locally, unit tests for base, regression and miscmodels now pass

This is still a version with an explicit `frame` keyword from `from_formula` to `model.__init__`

josef-pkt

comment created time in 6 days

push eventjosef-pkt/statsmodels

commit sha f63c2435cbcb4a52463c9d482f2f654de2a3a4c2

fix `frame` as `Model.__init__` keyword

push time in 6 days

issue openedstatsmodels/statsmodels

I currently need some extra info in the model about how the formula handled the data (specifically endog); see the last part of the comment https://github.com/statsmodels/statsmodels/pull/7021#issuecomment-692267412

related: We also want additional formula_info for other reasons and cases

#2824 #5342 #5469

A formula_info keyword can replace the current explicit keywords `["missing_idx", "design_info", "formula"]`, and replace attributes like `frame` that are attached in from_formula to the created model instance.

In #7021, I still have problems adding a new `frame` keyword, but correctly implementing `formula_info` would allow that we add additional info without problems to the `formula_info` dict.

created time in 6 days

fixing the allowed kwargs is easy, but

it looks like there is a trickier problem with missing value handling, with several test failures for that. `frame` is an extra array in `__init__` in my current code, which seems to get automatically included in nan handling. I can work around that by wrapping it in something that is not treated as a data array, e.g. `'frame': np.array({"data": data})` (needs ndim; a plain dict raises an error)

next problem: pickling (all other unit tests in `base` pass)

``````..\..\\statsmodels\statsmodels_py37\statsmodels\statsmodels\
base\tests\test_shrink_pickle.py . [ 87%]
.........................F.FF.FF.FF.FF.F...                              [ 97%]
``````

failing test are all in formula handling in pickling

``````..\..\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\
base\tests\test_shrink_pickle.py::TestPickleFormula::test_remove_data_pickle FAI
LED [ 61%]
..\..\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\
base\tests\test_shrink_pickle.py::TestPickleFormula::test_remove_data_docstring
PASSED [ 63%]
..\..\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\
base\tests\test_shrink_pickle.py::TestPickleFormula::test_pickle_wrapper FAILED
[ 65%]
..\..\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\
base\tests\test_shrink_pickle.py::TestPickleFormula2::test_remove_data_pickle FA
ILED [ 68%]
..\..\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\
base\tests\test_shrink_pickle.py::TestPickleFormula2::test_remove_data_docstring
PASSED [ 70%]
..\..\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\
base\tests\test_shrink_pickle.py::TestPickleFormula2::test_pickle_wrapper FAILED
[ 72%]
..\..\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\
base\tests\test_shrink_pickle.py::TestPickleFormula3::test_remove_data_pickle FA
ILED [ 75%]
..\..\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\
base\tests\test_shrink_pickle.py::TestPickleFormula3::test_remove_data_docstring
PASSED [ 77%]
..\..\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\
base\tests\test_shrink_pickle.py::TestPickleFormula3::test_pickle_wrapper FAILED
[ 79%]
..\..\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\
base\tests\test_shrink_pickle.py::TestPickleFormula4::test_remove_data_pickle FA
ILED [ 81%]
..\..\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\
base\tests\test_shrink_pickle.py::TestPickleFormula4::test_remove_data_docstring
PASSED [ 84%]
..\..\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\
base\tests\test_shrink_pickle.py::TestPickleFormula4::test_pickle_wrapper FAILED
[ 86%]
..\..\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\
base\tests\test_shrink_pickle.py::TestPickleFormula5::test_remove_data_pickle FA
ILED [ 88%]
..\..\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\
base\tests\test_shrink_pickle.py::TestPickleFormula5::test_remove_data_docstring
PASSED [ 90%]
..\..\josef_new\eclipse_ws\statsmodels\statsmodels_py37\statsmodels\statsmodels\
base\tests\test_shrink_pickle.py::TestPickleFormula5::test_pickle_wrapper FAILED
[ 93%]
``````

Might be better to add some `internal` or `formula_info` keyword to `model.__init__` instead of another specific keyword

eg. in MixedLM, with my addition of `frame`

`_allowed_kwargs = ["missing_idx", "design_info", "formula", "frame"]`

those are added in `from_formula`

``````        kwargs.update({'missing_idx': missing_idx,
'missing': missing,
'formula': formula,  # attach formula for unpckling
'design_info': design_info,
'frame': np.array({"data": data})  # try dict to avoid missing
})
``````

I think that should work, if we have a `formula_info` keyword, then eventually we might be able to reuse it for user provided formula info. e.g. user calls patsy design_matrix and gives us the design_info, or formula info based on other than patsy's formula handling.

josef-pkt

comment created time in 6 days

issue closedstatsmodels/statsmodels

#### Describe the bug

I'm fitting data using `statsmodels.api.NegativeBinomial`. When attempting to get the marginal effects from the results using `results.get_margeff(at='all', method='eyex')`, Statsmodels raises an IndexError, I think due to the mismatch between the exogenous variables matrix and `effects_idx`.

#### Code Sample, a copy-pastable example if possible

``````import pandas as pd
import numpy as np
import statsmodels.api as sm

high = 5.
n_obs = 5000
alpha = 6.5
const = 0.

np.random.seed(42)

betas = pd.Series([0.62, -1.71], index=['aa', 'bb'])

df_exog = pd.DataFrame(np.random.uniform(high=high, size=(n_obs, 2)),
columns=betas.keys())
lambda_expected = np.exp(df_exog.dot(betas) + const)
gamma_draws = np.random.gamma(1. / alpha, alpha, size=n_obs)
endog = np.random.poisson(lam=(lambda_expected * gamma_draws))

model = sm.NegativeBinomial(endog, df_exog)
results = model.fit(maxiter=100, disp=False)
#  This raises the error.
results.get_margeff(at='all', method='eyex')
``````

#### Expected Output

An array of marginal effects for all exogenous variables at each observation.

#### Output of `import statsmodels.api as sm; sm.show_versions()`

I get an IndexError

``````IndexError: boolean index did not match indexed array along dimension 1; dimension is 3 but corresponding boolean dimension is 4
``````

In line 728 of discrete_margins.py it looks like `effects_idx` is corrected for this dimension mismatch, but only when `at` is not equal to `'all'`.

From `import statsmodels.api as sm; sm.show_versions()` :

``````
INSTALLED VERSIONS
------------------
Python: 3.6.9.final.0
OS: Linux 4.4.0-18362-Microsoft #836-Microsoft Mon May 05 16:04:00 PST 2020 x86_64
byteorder: little
LC_ALL: None
LANG: C.UTF-8

statsmodels
===========

Installed: 0.11.1 (/home/czhu/pythonenv/bditto/lib/python3.6/site-packages/statsmodels)

Required Dependencies
=====================

cython: 0.29.15 (/home/czhu/pythonenv/bditto/lib/python3.6/site-packages/Cython)
numpy: 1.18.4 (/home/czhu/pythonenv/bditto/lib/python3.6/site-packages/numpy)
scipy: 1.4.1 (/home/czhu/pythonenv/bditto/lib/python3.6/site-packages/scipy)
pandas: 1.0.3 (/home/czhu/pythonenv/bditto/lib/python3.6/site-packages/pandas)
dateutil: 2.8.1 (/home/czhu/pythonenv/bditto/lib/python3.6/site-packages/dateutil)
patsy: 0.5.1 (/home/czhu/pythonenv/bditto/lib/python3.6/site-packages/patsy)

Optional Dependencies
=====================

matplotlib: 3.2.0 (/home/czhu/pythonenv/bditto/lib/python3.6/site-packages/matplotlib)
backend: module://ipykernel.pylab.backend_inline
cvxopt: Not installed
joblib: 0.15.1 (/home/czhu/pythonenv/bditto/lib/python3.6/site-packages/joblib)

Developer Tools
================

IPython: 7.14.0 (/home/czhu/pythonenv/bditto/lib/python3.6/site-packages/IPython)
jinja2: 2.11.2 (/home/czhu/pythonenv/bditto/lib/python3.6/site-packages/jinja2)
sphinx: Not installed
pygments: 2.6.1 (/home/czhu/pythonenv/bditto/lib/python3.6/site-packages/pygments)
pytest: 5.4.1 (/home/czhu/pythonenv/bditto/lib/python3.6/site-packages/pytest)
virtualenv: 20.0.5 (/home/czhu/pythonenv/bditto/lib/python3.6/site-packages/virtualenv)
``````

I've also tried building statsmodels v0.12.0.dev0+326.g856385ac1 from source and running the code snippet, and find the same error.

closed time in 6 days

cczhu

quickfix merged in #6939 for 0.12

cczhu

comment created time in 6 days

#6939 fixed problem with extra params in negbin, #6763

still needs unit tests and more review of margins for models with extra params, negbin, NBP, GBP (IIRC, margins are not available for zero inflated count models)

josef-pkt

comment created time in 6 days

issue closedstatsmodels/statsmodels

hypothesis tests in recently merged PRs for rates and proportions have plain tuple return for some functions, e.g. tost_xxx

Should return a HolderTuple by default, change before release so we don't need to break backwards compatibility

closed time in 6 days

josef-pkt

AFAIR, I fixed those before the release of 0.12

josef-pkt

comment created time in 6 days

see #2163

I prefer `xxx_kwds`, which sounds nicer to me than `kwargs`

in GMM I used `optim_args`, robust sandwiches use `cov_kwds`

mknz

comment created time in 6 days

The problem in fit is that remaining kwargs go into the optimizer and those keywords are not standardized

I just ran the above example. It looks like both spellings work correctly with default optimizers.

`Poisson(np.random.random((10, 1)), np.random.random((10, 2))).fit(max_iter=4, method="bfgs")` this doesn't limit the number of iterations, and converges in more steps.

What I didn't understand is that kwargs don't raise in the scipy optimizers if we hand all remaining kwargs to them. I didn't remember this correctly. AFAICS, the base.optimizer functions get the kwargs, but use only explicit keywords in the call to the scipy optimizers. All other kwargs are ignored, if there are any extras not valid for the optimizer.

The list of specific optimizer kwds is only available in the `_fit_xxx` functions in base.optimize.

Kerby used an allowed list in MixedLM (IIRC) for `model.__init__` where we have a similar kwargs problem.

mknz

comment created time in 7 days

issue openedstatsmodels/statsmodels

There are several errors and formatting problems (e.g. line breaks, lists)

e.g. https://www.statsmodels.org/dev/generated/statsmodels.stats.rates.tost_poisson_2indep.html Parameters, data for second sample copied "first", same for `test_poisson_2indep` Return has changed to HolderTuple

test_poisson_2indep has etest_kwds in signature but missing in docstring

created time in 8 days

push eventstatsmodels/statsmodels.github.io

commit sha 81147be09853f7a20de9b0d60dea0cf2d8fd3062

Update docs after building Travis build 12367 of statsmodels/statsmodels The docs were built from the branch 'master' against the commit 43287058a9aa054922df9161dbe720bd4fc3264e. The Travis build that generated this commit is at https://travis-ci.org/statsmodels/statsmodels/jobs/726362862. The doctr command that was run is /home/travis/miniconda/envs/statsmodels-test/bin/doctr deploy --built-docs docs/build/html/ --deploy-repo statsmodels/statsmodels.github.io devel

push time in 9 days

looks pretty bad: unit tests fail all over the place (but pytest output in this case is much too verbose to get an overview)

one case in missing handling in `statsmodels.formula.tests.test_formula.TestFormulaDict`

another case, MixedLM uses an allowed list

``````_allowed_kwargs = ["missing_idx", "design_info", "formula"]
for x in kwargs.keys():
if x not in _allowed_kwargs:
>               raise ValueError(
"argument %s not permitted for MixedLM initialization" % x)
E               ValueError: argument frame not permitted for MixedLM initialization
``````
josef-pkt

comment created time in 13 days

I made some changes in how `frame` is attached to model.data in formula handling. This might need more review. Now, only numerical or pandas ordered Categorical are allowed as endog in formulas, anything else should raise a ValueError.

josef-pkt

comment created time in 13 days

push eventjosef-pkt/statsmodels

ENH: properly connect param_names, give pandas codes endog to super

commit sha 6fc1e655fc3aa4b9d1e01edbdbe65b5eabd1d01f

ENH/REF: support pd ordered Categorical in formula

push time in 13 days

tricky: `model.data.frame` and `formula` is not available in a `model.__init__` because `from_formula` adds the attribute after creating the instance. I cannot use the attribute to change things inside `__init__`.

``````        mod = cls(endog, exog, *args, **kwargs)
mod.formula = formula

# since we got a dataframe, attach the original
mod.data.frame = data
return mod
``````
josef-pkt

comment created time in 13 days

I tried subclassing `LikelihoodModel`, but it fails because there is no default numerical derivative in `score`

josef-pkt

comment created time in 13 days

I'd like patsy to leave endog alone. the `model.__init__` should get the original endog except for dropping observations that have missing values in data for exog.

josef-pkt

comment created time in 13 days

I have an uncommitted change in `__init__` that converts 2-dim endog back to a 1-dim array. This works in the examples, however, it's too risky to let in unintended cases through patsy's formula handling. The only case I would like to support is using a pandas categorical endog and numerical endog in formulas. That looks a bit messy to implement.

``````            elif self.endog.ndim == 2:
# TODO: check if we really want to do this from formula
k_levels = self.endog.shape[1]
self.endog = self.endog.argmax(1)
labels = [str(i) for i in range(k_levels)]
# fix yname
self.data.ynames = self.data.ynames[0].split("[")[0]
``````

One problem is to access the original endog before patsy transformed it, currently that's difficult. Although, I just saw that we store the original dataframe in `mod_probf.data.frame`.

I guess the cleanest solution would be to add the option for not using patsy for endog to `statsmodels.formula.formulatools.handle_formula_data` which is called by `Model.__init__`

also, I can add a user facing `set_labels` method to `OrderedModel`, so users can get proper labels even with integer endog. That needs to change the threshold names in `param_names`.

josef-pkt

comment created time in 13 days

push eventstatsmodels/statsmodels.github.io

commit sha 7c22d924863088a34aedbbda6f10eb332eb0c158

Update docs after building Travis build 12362 of statsmodels/statsmodels The docs were built from the branch 'master' against the commit 7d727fd1bd82a761984e7374365709a6f004a8af. The Travis build that generated this commit is at https://travis-ci.org/statsmodels/statsmodels/jobs/724895593. The doctr command that was run is /home/travis/miniconda/envs/statsmodels-test/bin/doctr deploy --built-docs docs/build/html/ --deploy-repo statsmodels/statsmodels.github.io devel

push time in 13 days

push eventstatsmodels/statsmodels.github.io

commit sha 6872f5526f739f7f4a76d55d745fed46a5654184

Update docs after building Travis build 12359 of statsmodels/statsmodels The docs were built from the branch 'master' against the commit 04d66900191eab72a479753d1114c044c3520252. The Travis build that generated this commit is at https://travis-ci.org/statsmodels/statsmodels/jobs/724862700. The doctr command that was run is /home/travis/miniconda/envs/statsmodels-test/bin/doctr deploy --built-docs docs/build/html/ --deploy-repo statsmodels/statsmodels.github.io devel

push time in 13 days

I'm going to allow pandas endog that is not Categorical. Any reason against it?

``````data_student['apply_codes'] = data_student['apply'].cat.codes
mod_probp = OrderedModel(data_student['apply_codes'],
data_student[['pared', 'public', 'gpa']],
distr='probit')
res_probp = mod_probp.fit(method='bfgs')
print(res_probp.summary())
``````
``````                             OrderedModel Results
==============================================================================
Dep. Variable:            apply_codes   Log-Likelihood:                -358.75
Model:                   OrderedModel   AIC:                             723.5
Method:            Maximum Likelihood   BIC:                             735.5
Date:                Sat, 05 Sep 2020
Time:                        18:23:52
No. Observations:                 400
Df Residuals:                     397
Df Model:                           2
==============================================================================
coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
pared          0.5981      0.158      3.789      0.000       0.289       0.908
public         0.0102      0.173      0.059      0.953      -0.329       0.349
gpa            0.3582      0.157      2.285      0.022       0.051       0.665
0/1            1.2968      0.468      2.774      0.006       0.381       2.213
1/2            0.1873      0.074      2.530      0.011       0.042       0.332
==============================================================================
``````
Bolzano-Weierstrass

comment created time in 15 days

push eventjosef-pkt/statsmodels

ENH: properly connect param_names, give pandas codes endog to super

push time in 15 days

Thanks,

Missing values are given the code -1

This means our nan/missing detection will not find it. I will raise an exception if `codes.min() == -1`, because we cannot support it in the current structure and code flow. `-1` would still be valid in indexing, but index the wrong thing, I guess without raising an exception in the computations.

Bolzano-Weierstrass

comment created time in 15 days

@Bolzano-Weierstrass question on using pandas Categorical.

I would like to get the `codes` as Series with the same index and name as the original Categorical Series, something like `endog = data_student['apply'].cat.codes` instead of `endog = np.asarray(endog.values.codes)`

Does it preserve the index and missing values? In the example `.cat.codes` doesn't have the series `name` anymore. I guess we can copy it directly.

The advantage would be that super classes of OrderedModel will use the standard data handling for pandas endog, e.g. attaching the `name` if available. The index might be used with the missing handling, but I'm not sure about that. Also using that in formulas would preserve the pandas name, e.g. use `cat` instead of `values` in the unit test example that uses `data['apply'].values.codes`

I found `.cat.codes` by google search and have no experience with pandas.Categorical

Bolzano-Weierstrass

comment created time in 15 days

similarly `model.endog_names`

e.g. `io.lib.summary._getnames` checks model endog_names and exog_names, and makes up generic names if not available.

endog_names and exog_names are properties in `base.Model` that return the `model.data` attribute, e.g.

``````@property
def endog_names(self):
"""
Names of endogenous variables.
"""
return self.data.ynames
``````
josef-pkt

comment created time in 15 days

push eventjosef-pkt/statsmodels

commit sha 25f20d81b4a5901ec333260c99f2039943f37e37

ENH: properly connect param_names

push time in 16 days

maybe if exog is None, then xname/exog_name/param_names should be an empty list instead of None. Then, we could more easily extend to adding extra params, e.g. I needed this also in the no-exog null model for `OrderedModel`

``````        # from GenericLikelihoodModel.fit
if self.exog is not None:
self.exog_names.extend(threshold_names)
else:
self.data.xnames = threshold_names
``````
josef-pkt

comment created time in 16 days

push eventjosef-pkt/statsmodels

push time in 16 days

push eventjosef-pkt/statsmodels

commit sha fb7fc4870a1e0809c77443596bec50369ee363dd

push time in 16 days

param_names

``````mod_prob.data.param_names
['x1', 'x2', 'x3', 'par0', 'par1']
``````

this is currently default inherited by GenericLikelihoodModel. extra params, if missing, are added automatically in `GenericLikelihoodModel.fit`. Before calling fit the extra params are missing in param_names

pandas xnames are not used ? `OrderedModel._check_inputs` uses asarray before super can handle the data

josef-pkt

comment created time in 16 days

now wald tests use `cov_names`, even more options to store and set names

changed in #6012 to support 2-d params

model specific `names` shouldn't really be in `data` to make it easier to set them as model attributes, e.g. extra params that are not related to exog.

My current problem is that I need to figure out where to add the extra params for a new model, and I don't remember or I'm not up-to-date. OrderedModel in #7021 (with 1-dim params)

josef-pkt

comment created time in 16 days

I'm going to change the endog handling a bit to use more pandas ordered categorical, i.e. use it instead of np.unique and store/attach it to the model. (Maybe add a short cut for the case when endog is an integer array with `range` values, e.g for bootstrapping) I guess pandas Categorical should be stable enough by now.

I used pandas for the pred_table, instead of histogram2d as in MNLogit.

`model.endog` remains `range` int because I use it for indexing.

josef-pkt

comment created time in 16 days

push eventjosef-pkt/statsmodels

commit sha 7a3d25193079b2bd688474212b11b18190fa9467

push time in 16 days

looks good to me, but I want to check it out because we don't have proper unit tests for summary

A smoke test for the slim option should be added to the unit test, or one that assert the partial content. AFAIR we have just one or a few of those.

janosbiro

comment created time in 16 days

confusion table (a quick try)

ordinal probit never predicts the highest level, category 2, which has a 0.1 fraction in the sample

``````counts = pd.DataFrame(np.column_stack((mod_prob.endog, res_prob.predict().argmax(1))), columns=["choice", "predicted"])
totals=pd.crosstab(counts['choice'], counts['predicted'], margins=True).reset_index()
print(totals)

predicted choice    0   1  All
0              0  202  18  220
1              1  112  28  140
2              2   27  13   40
3            All  341  59  400
``````

reformatted by hand

predicted \ choice 0 1 All
0 202 18 220
1 112 28 140
2 27 13 40
All 341 59 400
josef-pkt

comment created time in 17 days

overall observed fraction versus sum of probabilities. I don't know if those should be equal theoretically.

``````(res_prob.model.endog[:, None] == np.arange(3)).mean(0), res_prob.predict().mean(0)
(array([0.55, 0.35, 0.1 ]), array([0.55035823, 0.34945402, 0.10018775]))
``````

diagnostic later, in analogy to Hosmer Lemeshow we could check this for subsets of the data. (issue for binary models ?)

josef-pkt

comment created time in 17 days

resid and related are not available; GenericLikelihoodModel doesn't have a default `resid`

problem endog is numeric category labels and not choice dummy set

``````res_prob.resid
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-36-f2805ea7e4ed> in <module>
----> 1 res_prob.resid
AttributeError: 'OrderedResults' object has no attribute 'resid'

res_prob.model.endog - res_prob.predict()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-37-8edd3f44288b> in <module>
----> 1 res_prob.model.endog - res_prob.predict()
ValueError: operands could not be broadcast together with shapes (400,) (400,3)

res_prob.fittedvalues
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-39-488b948830d8> in <module>
----> 1 res_prob.fittedvalues
AttributeError: 'OrderedResults' object has no attribute 'fittedvalues'

(res_prob.model.endog[:, None] == np.arange(3)) - res_prob.predict()
array([[-0.55141664, -0.35768687,  0.90910351],
[-0.32600889,  0.5511176 , -0.22510871],
[ 0.76503143, -0.45063696, -0.31439447],
...,

``````

I haven't checked yet what would make sense here

josef-pkt

comment created time in 17 days

current parameter names are misleading because those are not the internal names

e.g. summary shows "gpa", internal is "x3"

`res_prob.t_test("gpa=0")` this raises exception, unknown token `res_prob.t_test("x3=0")` this works

josef-pkt

comment created time in 17 days

df_resid looks wrong, aic, bic of null model is nan, even though there are two parameters and llf looks fine

full model: df_resid is 400 - 3 and doesn't count the 2 threshold parameters, df_model subtracts 1 for constant ! (by default) null model: df_model is nan

``````res_prob.df_resid, res_prob.df_model, res_prob.model.df_resid, res_prob.model.df_model
(397.0, 2.0, 397.0, 2.0)
res_prob_null.df_resid, res_prob_null.df_model, res_prob_null.model.df_resid, res_prob_null.model.df_model
(nan, nan, nan, nan)

res_prob_null.llf, res_prob.llf
(-370.60264131581323, -358.74756379018913)
``````
josef-pkt

comment created time in 17 days

Stata ologit allows for offset and constraints

I added offset now, and it can be used to estimate the null model (currently only params of null results are tested)

`score_obs` based on numerical differences is available from GenericLikelihoodModel, but it will not be available when we change super class to LikelihoodModel. I needed to add `loglikeobs` (no underscore `_obs`)

with inherited score_obs, we inherit generic robust cov `res_prob_hc = mod_prob.fit(method='bfgs', cov_type="HC0")` works needs unit tests

Stata allows cov type "robust" (i.e. HC1 IIRC) and "cluster"

josef-pkt

comment created time in 17 days

push eventjosef-pkt/statsmodels

commit sha 11571a808d02d8611e11474b088730e739b22ea2

ENH/REF: add offset, scoreobs, DRY loglike and related

push time in 17 days

The property works fine in regular cases. But when we run into optimization problems, then it would be nice to be able to switch to adding options for the start_params computation.

Right now we cannot use a start_params method that is callable because it conflicts with the generic pattern.

josef-pkt

comment created time in 17 days

problem with derivatives, score with respect to threshold parameters

Because I use the exp increment parameterization for the constant in the integration bounds, the derivatives are different from Greene. A change in one threshold parameter affects all higher integration bounds, and so the derivative for those is not zero. Without the incremental parameterization, a threshold parameter affects only two bounds.

so for score I still have to compute all probabilities, also those that belong to a different interval than the observation.

One possibility would be to use the exp increment parameterization only during the optimization with numerical derivatives, and get score and hessian for the simple threshold constant for post-estimation. We do that in some count models.

Or use analytical score for exog parameters (which I have now) and numerical derivatives for threshold parameters. I did this somewhere. maybe (?) in count models

josef-pkt

comment created time in 17 days

issue openedstatsmodels/statsmodels

Both LikelihoodModel and GenericLikelihoodModel check for a `start_params` attribute or property.

We could allow this to be a callable method by adding a check `callable(self.start_params)`

I thought there are some issues for this, but don't find any.

in count models GeneralizedPoisson and NegativeBinomialP have preliminary start_params methods inside fit ZeroInflated calls `_get_start_params` after some preliminary setup miscmodel.TModel calls `_set_start_params` from `__init__` to set the `start_params` attribute

created time in 17 days

`+# -*- coding: utf-8 -*-+"""+Created on Sat Aug 22 20:24:42 2015++Author: Josef Perktold+License: BSD-3+"""++import numpy as np+import pandas as pd+from pandas.api.types import CategoricalDtype+from scipy import stats+from statsmodels.base.model import GenericLikelihoodModel, \+    GenericLikelihoodModelResults+from statsmodels.compat.pandas import Appender+++class OrderedModel(GenericLikelihoodModel):+    """Ordinal Model based on logistic or normal distribution++    The parameterization corresponds to the proportional odds model.++    The mode assumes that the endogenous variable is ordered but that the+    labels have no numeric interpretation besides the ordering.++    The model is based on a latent linear variable, where we observe only++    y_latent = X beta + u++    The observed variable is defined by the interval++    y = {0 if y_latent <= cut_0+         1 of cut_0 < y_latent <= cut_1+         ...+         K if cut_K < y_latent++    The probability of observing y=k conditional on the explanatory variables+    X is given by++    prob(y = k | x) = Prob(cut_k < y_latent <= cut_k+1)+                    = Prob(cut_k - x beta < u <= cut_k+1 - x beta+                    = F(cut_k+1 - x beta) - F(cut_k - x beta)++    Where F is the cumulative distribution of u which is either the normal+    or the logistic distribution, but can be set to any other continuous+    distribution. We use standardized distributions to avoid identifiability+    problems.+++    Parameters+    ----------+    endog : array_like+        endogenous or dependent ordered categorical variable with k levels.+        Labels or values of endog will internally transformed to consecutive+        integers, 0, 1, 2, ...+        pd.Series with Categorical as dtype should be preferred as it gives+        the order relation between the levels.+    exog : array_like+        exogenous explanatory variables. 
This should not include an intercept.+        (TODO: verify)+        pd.DataFrame are also accepted.+    distr : string 'probit' or 'logit', or a distribution instance+        The default is currently 'probit' which uses the normal distribution+        and corresponds to an ordered Probit model. The distribution is+        assumed to have the main methods of scipy.stats distributions, mainly+        cdf, pdf and ppf. The inverse cdf, ppf, is only use to calculate+        starting values.++    Status: initial version, subclasses `GenericLikelihoodModel`++    """++    def __init__(self, endog, exog, distr='probit', **kwds):++        if distr == 'probit':+            self.distr = stats.norm+        elif distr == 'logit':+            self.distr = stats.logistic+        else:+            self.distr = distr++        self.names, endog, exog = self._check_inputs(endog, exog)++        super(OrderedModel, self).__init__(endog, exog, **kwds)++        unique, index = np.unique(self.endog, return_inverse=True)+        self.k_levels = len(unique)+        self.endog = index+        self.labels = unique++        self.k_vars = self.exog.shape[1]+        self.results_class = OrderedResults++    def _check_inputs(self, endog, exog):+        """+        checks if self.distrib is legal and does the Pandas+        support for endog and exog. 
Also retrieves columns & categories+        names for .summary() of the results class.+        """+        names = {}+        if not isinstance(self.distr, stats.rv_continuous):+            msg = (+                f"{self.distr.name} must be a scipy.stats distribution."+            )+            raise ValueError(msg)++        # Pandas' support+        if (isinstance(exog, pd.DataFrame)) or (isinstance(exog, pd.Series)):+            exog_name = ([exog.name] if isinstance(exog, pd.Series)+                         else exog.columns.tolist())+            names['xname'] = exog_name+            exog = np.asarray(exog)++        if isinstance(endog, pd.Series):+            if isinstance(endog.dtypes, CategoricalDtype):+                if not endog.dtype.ordered:+                    import warnings+                    warnings.warn("the endog has ordered == False, "+                                  "risk of capturing a wrong order for the "+                                  "categories. ordered == True preferred.",+                                  Warning)+                endog_name = endog.name+                threshold_name = [str(x) + '/' + str(y)+                                  for x, y in zip(endog.values.categories[:-1],+                                                  endog.values.categories[1:])]+                names['yname'] = endog_name+                names['xname'] = names['xname'] + threshold_name+                endog = np.asarray(endog.values.codes)+            else:+                msg = (+                    "If the endog is a pandas.Serie "+                    "it must be of categoricalDtype."+                )+                raise ValueError(msg)++        return names, endog, exog++    def cdf(self, x):+        """cdf evaluated at x+        """+        return self.distr.cdf(x)++    def prob(self, low, upp):+        """interval probability+        """+        return np.maximum(self.cdf(upp) - self.cdf(low), 0)++    def 
transform_threshold_params(self, params):+        """transformation of the parameters in the optimization++        Parameters+        ----------+        params : nd_array+            contains (exog_coef, transformed_thresholds) where exog_coef are+            the coefficient for the explanatory variables in the linear term,+            transformed threshold or cutoff points. The first, lowest threshold+            is unchanged, all other thresholds are in terms of exponentiated+            increments++        Returns+        -------+        thresh : nd_array+            thresh are the thresholds or cutoff constants for the intervals.++        """+        th_params = params[-(self.k_levels - 1):]+        thresh = np.concatenate((th_params[:1],+                                 np.exp(th_params[1:]))).cumsum()+        thresh = np.concatenate(([-np.inf], thresh, [np.inf]))+        return thresh++    def transform_reverse_threshold_params(self, params):+        """obtain transformed thresholds from original thresholds, cutoff+        constants.++        """+        start_ppf = params+        thresh_params = np.concatenate((start_ppf[:1],+                                        np.log(np.diff(start_ppf[:-1]))))+        return thresh_params++    def predict(self, params, exog=None):+        """predicted probabilities for each level of the ordinal endog.+++        """+        # structure of params = [beta, constants_or_thresholds]++        # explicit in several steps to avoid bugs+        th_params = params[-(self.k_levels - 1):]+        thresh = np.concatenate((th_params[:1],+                                 np.exp(th_params[1:]))).cumsum()+        thresh = np.concatenate(([-np.inf], thresh, [np.inf]))+        xb = self.exog.dot(params[:-(self.k_levels - 1)])[:, None]+        low = thresh[:-1] - xb+        upp = thresh[1:] - xb+        prob = self.prob(low, upp)+        return prob++    def loglike(self, params):++        # structure of params = [beta, 
constants_or_thresholds]++        thresh = np.concatenate(([-np.inf],+                                 params[-(self.k_levels - 1):], [np.inf]))++        # explicit in several steps to avoid bugs+        th_params = params[-(self.k_levels - 1):]+        thresh = np.concatenate((th_params[:1],+                                 np.exp(th_params[1:]))).cumsum()+        thresh = np.concatenate(([-np.inf], thresh, [np.inf]))+        thresh_i_low = thresh[self.endog]+        thresh_i_upp = thresh[self.endog + 1]+        xb = self.exog.dot(params[:-(self.k_levels - 1)])`

outsource linpred, then we need to add offset only at a single method.

josef-pkt

comment created time in 17 days

PullRequestReviewEvent

`+# -*- coding: utf-8 -*-+"""+Created on Sat Aug 22 20:24:42 2015++Author: Josef Perktold+License: BSD-3+"""++import numpy as np+import pandas as pd+from pandas.api.types import CategoricalDtype+from scipy import stats+from statsmodels.base.model import GenericLikelihoodModel, \+    GenericLikelihoodModelResults+from statsmodels.compat.pandas import Appender+++class OrderedModel(GenericLikelihoodModel):+    """Ordinal Model based on logistic or normal distribution++    The parameterization corresponds to the proportional odds model.++    The mode assumes that the endogenous variable is ordered but that the+    labels have no numeric interpretation besides the ordering.++    The model is based on a latent linear variable, where we observe only++    y_latent = X beta + u++    The observed variable is defined by the interval++    y = {0 if y_latent <= cut_0+         1 of cut_0 < y_latent <= cut_1+         ...+         K if cut_K < y_latent++    The probability of observing y=k conditional on the explanatory variables+    X is given by++    prob(y = k | x) = Prob(cut_k < y_latent <= cut_k+1)+                    = Prob(cut_k - x beta < u <= cut_k+1 - x beta+                    = F(cut_k+1 - x beta) - F(cut_k - x beta)++    Where F is the cumulative distribution of u which is either the normal+    or the logistic distribution, but can be set to any other continuous+    distribution. We use standardized distributions to avoid identifiability+    problems.+++    Parameters+    ----------+    endog : array_like+        endogenous or dependent ordered categorical variable with k levels.+        Labels or values of endog will internally transformed to consecutive+        integers, 0, 1, 2, ...+        pd.Series with Categorical as dtype should be preferred as it gives+        the order relation between the levels.+    exog : array_like+        exogenous explanatory variables. 
This should not include an intercept.+        (TODO: verify)+        pd.DataFrame are also accepted.+    distr : string 'probit' or 'logit', or a distribution instance+        The default is currently 'probit' which uses the normal distribution+        and corresponds to an ordered Probit model. The distribution is+        assumed to have the main methods of scipy.stats distributions, mainly+        cdf, pdf and ppf. The inverse cdf, ppf, is only use to calculate+        starting values.++    Status: initial version, subclasses `GenericLikelihoodModel`++    """++    def __init__(self, endog, exog, distr='probit', **kwds):++        if distr == 'probit':+            self.distr = stats.norm+        elif distr == 'logit':+            self.distr = stats.logistic+        else:+            self.distr = distr++        self.names, endog, exog = self._check_inputs(endog, exog)++        super(OrderedModel, self).__init__(endog, exog, **kwds)++        unique, index = np.unique(self.endog, return_inverse=True)+        self.k_levels = len(unique)+        self.endog = index+        self.labels = unique++        self.k_vars = self.exog.shape[1]+        self.results_class = OrderedResults++    def _check_inputs(self, endog, exog):+        """+        checks if self.distrib is legal and does the Pandas+        support for endog and exog. 
Also retrieves columns & categories+        names for .summary() of the results class.+        """+        names = {}+        if not isinstance(self.distr, stats.rv_continuous):+            msg = (+                f"{self.distr.name} must be a scipy.stats distribution."+            )+            raise ValueError(msg)++        # Pandas' support+        if (isinstance(exog, pd.DataFrame)) or (isinstance(exog, pd.Series)):+            exog_name = ([exog.name] if isinstance(exog, pd.Series)+                         else exog.columns.tolist())+            names['xname'] = exog_name+            exog = np.asarray(exog)++        if isinstance(endog, pd.Series):+            if isinstance(endog.dtypes, CategoricalDtype):+                if not endog.dtype.ordered:+                    import warnings+                    warnings.warn("the endog has ordered == False, "+                                  "risk of capturing a wrong order for the "+                                  "categories. ordered == True preferred.",+                                  Warning)+                endog_name = endog.name+                threshold_name = [str(x) + '/' + str(y)+                                  for x, y in zip(endog.values.categories[:-1],+                                                  endog.values.categories[1:])]+                names['yname'] = endog_name+                names['xname'] = names['xname'] + threshold_name+                endog = np.asarray(endog.values.codes)+            else:+                msg = (+                    "If the endog is a pandas.Serie "+                    "it must be of categoricalDtype."+                )+                raise ValueError(msg)++        return names, endog, exog++    def cdf(self, x):+        """cdf evaluated at x+        """+        return self.distr.cdf(x)++    def prob(self, low, upp):+        """interval probability+        """+        return np.maximum(self.cdf(upp) - self.cdf(low), 0)++    def 
transform_threshold_params(self, params):+        """transformation of the parameters in the optimization++        Parameters+        ----------+        params : nd_array+            contains (exog_coef, transformed_thresholds) where exog_coef are+            the coefficient for the explanatory variables in the linear term,+            transformed threshold or cutoff points. The first, lowest threshold+            is unchanged, all other thresholds are in terms of exponentiated+            increments++        Returns+        -------+        thresh : nd_array+            thresh are the thresholds or cutoff constants for the intervals.++        """+        th_params = params[-(self.k_levels - 1):]+        thresh = np.concatenate((th_params[:1],+                                 np.exp(th_params[1:]))).cumsum()+        thresh = np.concatenate(([-np.inf], thresh, [np.inf]))+        return thresh++    def transform_reverse_threshold_params(self, params):+        """obtain transformed thresholds from original thresholds, cutoff+        constants.++        """+        start_ppf = params+        thresh_params = np.concatenate((start_ppf[:1],+                                        np.log(np.diff(start_ppf[:-1]))))+        return thresh_params++    def predict(self, params, exog=None):+        """predicted probabilities for each level of the ordinal endog.+++        """+        # structure of params = [beta, constants_or_thresholds]++        # explicit in several steps to avoid bugs+        th_params = params[-(self.k_levels - 1):]+        thresh = np.concatenate((th_params[:1],+                                 np.exp(th_params[1:]))).cumsum()+        thresh = np.concatenate(([-np.inf], thresh, [np.inf]))+        xb = self.exog.dot(params[:-(self.k_levels - 1)])[:, None]+        low = thresh[:-1] - xb+        upp = thresh[1:] - xb`

Maybe not here: `predict` computes all integration bounds, while `loglike` only needs two per observation.

In `predict`, `low` and `upp` could be selected after computing `thresh - xb` once, instead of computing it twice.

josef-pkt

comment created time in 17 days

PullRequestReviewEvent

I opened issue #7023 to see whether we can easily convert link classes to distribution classes (difference in method names) open for later

josef-pkt

comment created time in 17 days

issue openedstatsmodels/statsmodels

ordinal models #7021 require a distribution for the latent variable (cdf, ppf, pdf) Typical cases are the same as links for Binomial.

We could add a wrapper class that converts link classes to distribution classes with just the main methods. Link classes already have relevant derivatives.

created time in 17 days

Probably because that CI job runs the tests from an install, not from the source tree.

It is good that we have some CI that does this; otherwise we might never check for files missing from the installed distribution.

josef-pkt

comment created time in 17 days

`+# -*- coding: utf-8 -*-+"""+Created on Sat Aug 22 20:24:42 2015++Author: Josef Perktold+License: BSD-3+"""++import numpy as np+import pandas as pd+from pandas.api.types import CategoricalDtype+from scipy import stats+from statsmodels.base.model import GenericLikelihoodModel, \+    GenericLikelihoodModelResults+from statsmodels.compat.pandas import Appender+++class OrderedModel(GenericLikelihoodModel):+    """Ordinal Model based on logistic or normal distribution++    The parameterization corresponds to the proportional odds model.++    The mode assumes that the endogenous variable is ordered but that the+    labels have no numeric interpretation besides the ordering.++    The model is based on a latent linear variable, where we observe only++    y_latent = X beta + u++    The observed variable is defined by the interval++    y = {0 if y_latent <= cut_0+         1 of cut_0 < y_latent <= cut_1+         ...+         K if cut_K < y_latent++    The probability of observing y=k conditional on the explanatory variables+    X is given by++    prob(y = k | x) = Prob(cut_k < y_latent <= cut_k+1)+                    = Prob(cut_k - x beta < u <= cut_k+1 - x beta+                    = F(cut_k+1 - x beta) - F(cut_k - x beta)++    Where F is the cumulative distribution of u which is either the normal+    or the logistic distribution, but can be set to any other continuous+    distribution. We use standardized distributions to avoid identifiability+    problems.+++    Parameters+    ----------+    endog : array_like+        endogenous or dependent ordered categorical variable with k levels.+        Labels or values of endog will internally transformed to consecutive+        integers, 0, 1, 2, ...+        pd.Series with Categorical as dtype should be preferred as it gives+        the order relation between the levels.+    exog : array_like+        exogenous explanatory variables. 
This should not include an intercept.+        (TODO: verify)+        pd.DataFrame are also accepted.+    distr : string 'probit' or 'logit', or a distribution instance+        The default is currently 'probit' which uses the normal distribution+        and corresponds to an ordered Probit model. The distribution is+        assumed to have the main methods of scipy.stats distributions, mainly+        cdf, pdf and ppf. The inverse cdf, ppf, is only use to calculate+        starting values.++    Status: initial version, subclasses `GenericLikelihoodModel`++    """++    def __init__(self, endog, exog, distr='probit', **kwds):++        if distr == 'probit':+            self.distr = stats.norm+        elif distr == 'logit':+            self.distr = stats.logistic+        else:+            self.distr = distr++        self.names, endog, exog = self._check_inputs(endog, exog)++        super(OrderedModel, self).__init__(endog, exog, **kwds)++        unique, index = np.unique(self.endog, return_inverse=True)+        self.k_levels = len(unique)+        self.endog = index+        self.labels = unique++        self.k_vars = self.exog.shape[1]+        self.results_class = OrderedResults++    def _check_inputs(self, endog, exog):+        """+        checks if self.distrib is legal and does the Pandas+        support for endog and exog. 
Also retrieves columns & categories+        names for .summary() of the results class.+        """+        names = {}+        if not isinstance(self.distr, stats.rv_continuous):+            msg = (+                f"{self.distr.name} must be a scipy.stats distribution."+            )+            raise ValueError(msg)++        # Pandas' support+        if (isinstance(exog, pd.DataFrame)) or (isinstance(exog, pd.Series)):+            exog_name = ([exog.name] if isinstance(exog, pd.Series)+                         else exog.columns.tolist())+            names['xname'] = exog_name+            exog = np.asarray(exog)++        if isinstance(endog, pd.Series):+            if isinstance(endog.dtypes, CategoricalDtype):+                if not endog.dtype.ordered:+                    import warnings+                    warnings.warn("the endog has ordered == False, "+                                  "risk of capturing a wrong order for the "+                                  "categories. ordered == True preferred.",+                                  Warning)+                endog_name = endog.name+                threshold_name = [str(x) + '/' + str(y)+                                  for x, y in zip(endog.values.categories[:-1],+                                                  endog.values.categories[1:])]+                names['yname'] = endog_name+                names['xname'] = names['xname'] + threshold_name+                endog = np.asarray(endog.values.codes)+            else:+                msg = (+                    "If the endog is a pandas.Serie "+                    "it must be of categoricalDtype."+                )+                raise ValueError(msg)++        return names, endog, exog++    def cdf(self, x):+        """cdf evaluated at x+        """+        return self.distr.cdf(x)++    def prob(self, low, upp):+        """interval probability+        """+        return np.maximum(self.cdf(upp) - self.cdf(low), 0)`

We need something analogous for the pdf, to support the analytical `score` and `hessian` computations.

josef-pkt

comment created time in 17 days

`+# -*- coding: utf-8 -*-+"""+Created on Sat Aug 22 20:24:42 2015++Author: Josef Perktold+License: BSD-3+"""++import numpy as np+import pandas as pd+from pandas.api.types import CategoricalDtype+from scipy import stats+from statsmodels.base.model import GenericLikelihoodModel, \+    GenericLikelihoodModelResults+from statsmodels.compat.pandas import Appender+++class OrderedModel(GenericLikelihoodModel):+    """Ordinal Model based on logistic or normal distribution++    The parameterization corresponds to the proportional odds model.++    The mode assumes that the endogenous variable is ordered but that the+    labels have no numeric interpretation besides the ordering.++    The model is based on a latent linear variable, where we observe only++    y_latent = X beta + u++    The observed variable is defined by the interval++    y = {0 if y_latent <= cut_0+         1 of cut_0 < y_latent <= cut_1+         ...+         K if cut_K < y_latent++    The probability of observing y=k conditional on the explanatory variables+    X is given by++    prob(y = k | x) = Prob(cut_k < y_latent <= cut_k+1)+                    = Prob(cut_k - x beta < u <= cut_k+1 - x beta+                    = F(cut_k+1 - x beta) - F(cut_k - x beta)++    Where F is the cumulative distribution of u which is either the normal+    or the logistic distribution, but can be set to any other continuous+    distribution. We use standardized distributions to avoid identifiability+    problems.+++    Parameters+    ----------+    endog : array_like+        endogenous or dependent ordered categorical variable with k levels.+        Labels or values of endog will internally transformed to consecutive+        integers, 0, 1, 2, ...+        pd.Series with Categorical as dtype should be preferred as it gives+        the order relation between the levels.+    exog : array_like+        exogenous explanatory variables. 
This should not include an intercept.+        (TODO: verify)+        pd.DataFrame are also accepted.+    distr : string 'probit' or 'logit', or a distribution instance+        The default is currently 'probit' which uses the normal distribution+        and corresponds to an ordered Probit model. The distribution is+        assumed to have the main methods of scipy.stats distributions, mainly+        cdf, pdf and ppf. The inverse cdf, ppf, is only use to calculate+        starting values.++    Status: initial version, subclasses `GenericLikelihoodModel`++    """++    def __init__(self, endog, exog, distr='probit', **kwds):++        if distr == 'probit':+            self.distr = stats.norm+        elif distr == 'logit':+            self.distr = stats.logistic+        else:+            self.distr = distr++        self.names, endog, exog = self._check_inputs(endog, exog)++        super(OrderedModel, self).__init__(endog, exog, **kwds)++        unique, index = np.unique(self.endog, return_inverse=True)+        self.k_levels = len(unique)+        self.endog = index+        self.labels = unique++        self.k_vars = self.exog.shape[1]+        self.results_class = OrderedResults++    def _check_inputs(self, endog, exog):+        """+        checks if self.distrib is legal and does the Pandas+        support for endog and exog. 
Also retrieves columns & categories+        names for .summary() of the results class.+        """+        names = {}+        if not isinstance(self.distr, stats.rv_continuous):+            msg = (+                f"{self.distr.name} must be a scipy.stats distribution."+            )+            raise ValueError(msg)++        # Pandas' support+        if (isinstance(exog, pd.DataFrame)) or (isinstance(exog, pd.Series)):+            exog_name = ([exog.name] if isinstance(exog, pd.Series)+                         else exog.columns.tolist())+            names['xname'] = exog_name+            exog = np.asarray(exog)++        if isinstance(endog, pd.Series):+            if isinstance(endog.dtypes, CategoricalDtype):+                if not endog.dtype.ordered:+                    import warnings+                    warnings.warn("the endog has ordered == False, "+                                  "risk of capturing a wrong order for the "+                                  "categories. ordered == True preferred.",+                                  Warning)+                endog_name = endog.name+                threshold_name = [str(x) + '/' + str(y)+                                  for x, y in zip(endog.values.categories[:-1],+                                                  endog.values.categories[1:])]+                names['yname'] = endog_name+                names['xname'] = names['xname'] + threshold_name+                endog = np.asarray(endog.values.codes)+            else:+                msg = (+                    "If the endog is a pandas.Serie "+                    "it must be of categoricalDtype."+                )+                raise ValueError(msg)++        return names, endog, exog++    def cdf(self, x):+        """cdf evaluated at x+        """+        return self.distr.cdf(x)++    def prob(self, low, upp):+        """interval probability+        """+        return np.maximum(self.cdf(upp) - self.cdf(low), 0)++    def 
transform_threshold_params(self, params):+        """transformation of the parameters in the optimization++        Parameters+        ----------+        params : nd_array+            contains (exog_coef, transformed_thresholds) where exog_coef are+            the coefficient for the explanatory variables in the linear term,+            transformed threshold or cutoff points. The first, lowest threshold+            is unchanged, all other thresholds are in terms of exponentiated+            increments++        Returns+        -------+        thresh : nd_array+            thresh are the thresholds or cutoff constants for the intervals.++        """+        th_params = params[-(self.k_levels - 1):]+        thresh = np.concatenate((th_params[:1],+                                 np.exp(th_params[1:]))).cumsum()+        thresh = np.concatenate(([-np.inf], thresh, [np.inf]))+        return thresh++    def transform_reverse_threshold_params(self, params):+        """obtain transformed thresholds from original thresholds, cutoff+        constants.++        """+        start_ppf = params+        thresh_params = np.concatenate((start_ppf[:1],+                                        np.log(np.diff(start_ppf[:-1]))))+        return thresh_params++    def predict(self, params, exog=None):+        """predicted probabilities for each level of the ordinal endog.+++        """+        # structure of params = [beta, constants_or_thresholds]++        # explicit in several steps to avoid bugs+        th_params = params[-(self.k_levels - 1):]+        thresh = np.concatenate((th_params[:1],+                                 np.exp(th_params[1:]))).cumsum()+        thresh = np.concatenate(([-np.inf], thresh, [np.inf]))+        xb = self.exog.dot(params[:-(self.k_levels - 1)])[:, None]`

BUG: `predict` should use the `exog` argument when it is not None, instead of always using `self.exog`.

josef-pkt

comment created time in 17 days

`+# -*- coding: utf-8 -*-+"""+Created on Sat Aug 22 20:24:42 2015++Author: Josef Perktold+License: BSD-3+"""++import numpy as np+import pandas as pd+from pandas.api.types import CategoricalDtype+from scipy import stats+from statsmodels.base.model import GenericLikelihoodModel, \+    GenericLikelihoodModelResults+from statsmodels.compat.pandas import Appender+++class OrderedModel(GenericLikelihoodModel):+    """Ordinal Model based on logistic or normal distribution++    The parameterization corresponds to the proportional odds model.++    The mode assumes that the endogenous variable is ordered but that the+    labels have no numeric interpretation besides the ordering.++    The model is based on a latent linear variable, where we observe only++    y_latent = X beta + u++    The observed variable is defined by the interval++    y = {0 if y_latent <= cut_0+         1 of cut_0 < y_latent <= cut_1+         ...+         K if cut_K < y_latent++    The probability of observing y=k conditional on the explanatory variables+    X is given by++    prob(y = k | x) = Prob(cut_k < y_latent <= cut_k+1)+                    = Prob(cut_k - x beta < u <= cut_k+1 - x beta+                    = F(cut_k+1 - x beta) - F(cut_k - x beta)++    Where F is the cumulative distribution of u which is either the normal+    or the logistic distribution, but can be set to any other continuous+    distribution. We use standardized distributions to avoid identifiability+    problems.+++    Parameters+    ----------+    endog : array_like+        endogenous or dependent ordered categorical variable with k levels.+        Labels or values of endog will internally transformed to consecutive+        integers, 0, 1, 2, ...+        pd.Series with Categorical as dtype should be preferred as it gives+        the order relation between the levels.+    exog : array_like+        exogenous explanatory variables. 
This should not include an intercept.+        (TODO: verify)+        pd.DataFrame are also accepted.+    distr : string 'probit' or 'logit', or a distribution instance+        The default is currently 'probit' which uses the normal distribution+        and corresponds to an ordered Probit model. The distribution is+        assumed to have the main methods of scipy.stats distributions, mainly+        cdf, pdf and ppf. The inverse cdf, ppf, is only use to calculate+        starting values.++    Status: initial version, subclasses `GenericLikelihoodModel`++    """++    def __init__(self, endog, exog, distr='probit', **kwds):++        if distr == 'probit':+            self.distr = stats.norm+        elif distr == 'logit':+            self.distr = stats.logistic+        else:+            self.distr = distr++        self.names, endog, exog = self._check_inputs(endog, exog)++        super(OrderedModel, self).__init__(endog, exog, **kwds)++        unique, index = np.unique(self.endog, return_inverse=True)+        self.k_levels = len(unique)+        self.endog = index+        self.labels = unique++        self.k_vars = self.exog.shape[1]+        self.results_class = OrderedResults++    def _check_inputs(self, endog, exog):+        """+        checks if self.distrib is legal and does the Pandas+        support for endog and exog. 
Also retrieves columns & categories+        names for .summary() of the results class.+        """+        names = {}+        if not isinstance(self.distr, stats.rv_continuous):+            msg = (+                f"{self.distr.name} must be a scipy.stats distribution."+            )+            raise ValueError(msg)++        # Pandas' support+        if (isinstance(exog, pd.DataFrame)) or (isinstance(exog, pd.Series)):+            exog_name = ([exog.name] if isinstance(exog, pd.Series)+                         else exog.columns.tolist())+            names['xname'] = exog_name+            exog = np.asarray(exog)++        if isinstance(endog, pd.Series):+            if isinstance(endog.dtypes, CategoricalDtype):+                if not endog.dtype.ordered:+                    import warnings+                    warnings.warn("the endog has ordered == False, "+                                  "risk of capturing a wrong order for the "+                                  "categories. ordered == True preferred.",+                                  Warning)+                endog_name = endog.name+                threshold_name = [str(x) + '/' + str(y)+                                  for x, y in zip(endog.values.categories[:-1],+                                                  endog.values.categories[1:])]+                names['yname'] = endog_name+                names['xname'] = names['xname'] + threshold_name`

`exog` does not need model-specific treatment; this should be handled by the generic framework. `threshold_name` should be built from `labels` in `__init__`, independent of the input format.

josef-pkt

comment created time in 17 days

`+# -*- coding: utf-8 -*-+"""+Created on Sat Aug 22 20:24:42 2015++Author: Josef Perktold+License: BSD-3+"""++import numpy as np+import pandas as pd+from pandas.api.types import CategoricalDtype+from scipy import stats+from statsmodels.base.model import GenericLikelihoodModel, \+    GenericLikelihoodModelResults+from statsmodels.compat.pandas import Appender+++class OrderedModel(GenericLikelihoodModel):+    """Ordinal Model based on logistic or normal distribution++    The parameterization corresponds to the proportional odds model.++    The mode assumes that the endogenous variable is ordered but that the+    labels have no numeric interpretation besides the ordering.++    The model is based on a latent linear variable, where we observe only++    y_latent = X beta + u++    The observed variable is defined by the interval++    y = {0 if y_latent <= cut_0+         1 of cut_0 < y_latent <= cut_1+         ...+         K if cut_K < y_latent++    The probability of observing y=k conditional on the explanatory variables+    X is given by++    prob(y = k | x) = Prob(cut_k < y_latent <= cut_k+1)+                    = Prob(cut_k - x beta < u <= cut_k+1 - x beta+                    = F(cut_k+1 - x beta) - F(cut_k - x beta)++    Where F is the cumulative distribution of u which is either the normal+    or the logistic distribution, but can be set to any other continuous+    distribution. We use standardized distributions to avoid identifiability+    problems.+++    Parameters+    ----------+    endog : array_like+        endogenous or dependent ordered categorical variable with k levels.+        Labels or values of endog will internally transformed to consecutive+        integers, 0, 1, 2, ...+        pd.Series with Categorical as dtype should be preferred as it gives+        the order relation between the levels.+    exog : array_like+        exogenous explanatory variables. 
This should not include an intercept.+        (TODO: verify)+        pd.DataFrame are also accepted.+    distr : string 'probit' or 'logit', or a distribution instance+        The default is currently 'probit' which uses the normal distribution+        and corresponds to an ordered Probit model. The distribution is+        assumed to have the main methods of scipy.stats distributions, mainly+        cdf, pdf and ppf. The inverse cdf, ppf, is only use to calculate+        starting values.++    Status: initial version, subclasses `GenericLikelihoodModel`++    """++    def __init__(self, endog, exog, distr='probit', **kwds):++        if distr == 'probit':+            self.distr = stats.norm+        elif distr == 'logit':+            self.distr = stats.logistic+        else:+            self.distr = distr++        self.names, endog, exog = self._check_inputs(endog, exog)++        super(OrderedModel, self).__init__(endog, exog, **kwds)++        unique, index = np.unique(self.endog, return_inverse=True)+        self.k_levels = len(unique)+        self.endog = index+        self.labels = unique++        self.k_vars = self.exog.shape[1]+        self.results_class = OrderedResults++    def _check_inputs(self, endog, exog):+        """+        checks if self.distrib is legal and does the Pandas+        support for endog and exog. 
Also retrieves columns & categories+        names for .summary() of the results class.+        """+        names = {}+        if not isinstance(self.distr, stats.rv_continuous):+            msg = (+                f"{self.distr.name} must be a scipy.stats distribution."+            )+            raise ValueError(msg)++        # Pandas' support+        if (isinstance(exog, pd.DataFrame)) or (isinstance(exog, pd.Series)):+            exog_name = ([exog.name] if isinstance(exog, pd.Series)+                         else exog.columns.tolist())+            names['xname'] = exog_name+            exog = np.asarray(exog)++        if isinstance(endog, pd.Series):+            if isinstance(endog.dtypes, CategoricalDtype):+                if not endog.dtype.ordered:+                    import warnings+                    warnings.warn("the endog has ordered == False, "+                                  "risk of capturing a wrong order for the "+                                  "categories. ordered == True preferred.",+                                  Warning)+                endog_name = endog.name+                threshold_name = [str(x) + '/' + str(y)+                                  for x, y in zip(endog.values.categories[:-1],+                                                  endog.values.categories[1:])]+                names['yname'] = endog_name+                names['xname'] = names['xname'] + threshold_name+                endog = np.asarray(endog.values.codes)+            else:+                msg = (+                    "If the endog is a pandas.Serie "+                    "it must be of categoricalDtype."+                )+                raise ValueError(msg)++        return names, endog, exog++    def cdf(self, x):+        """cdf evaluated at x+        """+        return self.distr.cdf(x)++    def prob(self, low, upp):+        """interval probability+        """+        return np.maximum(self.cdf(upp) - self.cdf(low), 0)++    def 
transform_threshold_params(self, params):+        """transformation of the parameters in the optimization++        Parameters+        ----------+        params : nd_array+            contains (exog_coef, transformed_thresholds) where exog_coef are+            the coefficient for the explanatory variables in the linear term,+            transformed threshold or cutoff points. The first, lowest threshold+            is unchanged, all other thresholds are in terms of exponentiated+            increments++        Returns+        -------+        thresh : nd_array+            thresh are the thresholds or cutoff constants for the intervals.++        """+        th_params = params[-(self.k_levels - 1):]+        thresh = np.concatenate((th_params[:1],+                                 np.exp(th_params[1:]))).cumsum()+        thresh = np.concatenate(([-np.inf], thresh, [np.inf]))+        return thresh++    def transform_reverse_threshold_params(self, params):+        """obtain transformed thresholds from original thresholds, cutoff+        constants.++        """+        start_ppf = params+        thresh_params = np.concatenate((start_ppf[:1],+                                        np.log(np.diff(start_ppf[:-1]))))+        return thresh_params++    def predict(self, params, exog=None):+        """predicted probabilities for each level of the ordinal endog.+++        """+        # structure of params = [beta, constants_or_thresholds]++        # explicit in several steps to avoid bugs+        th_params = params[-(self.k_levels - 1):]+        thresh = np.concatenate((th_params[:1],+                                 np.exp(th_params[1:]))).cumsum()+        thresh = np.concatenate(([-np.inf], thresh, [np.inf]))+        xb = self.exog.dot(params[:-(self.k_levels - 1)])[:, None]+        low = thresh[:-1] - xb+        upp = thresh[1:] - xb`