Getting Started

Data load

[1]:
import posixpath
from urllib.parse import urlparse

from pyspark import SparkFiles
from pyspark.sql.functions import col
from pyspark.sql.session import SparkSession

spark = SparkSession.builder.getOrCreate()

url_X = 'https://raw.githubusercontent.com/seunghajeong/data_repo/master/simulation_data_x.csv'
url_y = 'https://raw.githubusercontent.com/seunghajeong/data_repo/master/simulation_data_y.csv'


def read_remote_csv_as_float(session, url):
    """Fetch a remote CSV through Spark and return it as an all-float Spark DataFrame.

    Parameters
    ----------
    session : SparkSession
        Session used both to register the remote file and to read it.
    url : str
        Location of a CSV file whose first row is a header.

    Returns
    -------
    pyspark.sql.DataFrame
        The CSV contents with every column cast to Spark's "float" type,
        keeping the original column names.
    """
    # Register the remote file with Spark so it can be looked up through SparkFiles.
    session.sparkContext.addFile(url)
    # SparkFiles.get() is keyed by file name; take it from the last segment of the
    # URL path instead of repeating it by hand (query string / fragment are ignored).
    file_name = posixpath.basename(urlparse(url).path)
    raw_df = session.read.csv(SparkFiles.get(file_name), header=True)
    # No schema is passed to read.csv, so cast every column explicitly, keeping its name.
    return raw_df.select(*(col(c).cast("float").alias(c) for c in raw_df.columns))


# Load the predictors first, then the response, converting each to pandas.
X_Sparkdataframe = read_remote_csv_as_float(spark, url_X)
X = X_Sparkdataframe.toPandas()
y_Sparkdataframe = read_remote_csv_as_float(spark, url_y)
y = y_Sparkdataframe.toPandas()
[3]:
# Preview the first rows of X, the pandas DataFrame produced by toPandas() in the data-load cell.
X.head()
[3]:
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 ... V91 V92 V93 V94 V95 V96 V97 V98 V99 V100
0 0.101741 0.144909 0.235567 0.576186 0.299443 0.296395 0.902235 0.265811 0.420927 0.684045 ... -0.058677 0.497420 -1.124986 0.338215 -0.942943 -1.257044 -0.531471 1.236317 0.405682 0.387636
1 -0.105054 -0.110128 -0.033311 -0.042925 -0.752605 -0.794815 -1.699739 -1.891533 -1.287547 -1.154547 ... -1.614948 -1.337878 0.795742 1.101117 -0.920702 -0.098002 -0.269719 0.333092 -0.500367 1.340876
2 -0.478922 -0.058475 -0.620625 1.775435 0.935760 1.268173 -2.652118 -2.327299 -2.913926 -2.140256 ... 0.081620 0.999657 -0.594758 0.057804 -1.259650 0.321864 0.992930 0.552269 1.253700 0.974150
3 -0.657057 -0.508105 -0.556453 0.057045 -0.344814 -0.557824 -0.814844 0.016355 -0.384234 -0.022224 ... 0.077500 -1.489750 -0.151010 0.347814 2.281268 -1.275026 -0.141539 -0.335557 0.196004 -1.347782
4 -0.939275 -0.780565 -0.495687 -0.308105 -0.604543 -0.272711 0.080763 0.542582 0.580669 0.162638 ... 0.677662 0.188278 0.716616 -1.290398 -0.579556 -0.692827 -1.040820 -0.674525 1.355343 1.754870

5 rows × 100 columns

[4]:
# Preview the first rows of y, the pandas DataFrame produced by toPandas() in the data-load cell.
y.head()
[4]:
V1
0 2.811113
1 1.049249
2 -4.496389
3 -3.846408
4 -2.805357

General Usage

[5]:
from Hi_LASSO_pyspark import HiLASSO_Spark

# Configure the estimator on the pandas frames X and y loaded earlier.
# NOTE(review): the keyword meanings are defined by Hi_LASSO_pyspark and are not
# visible in this notebook — confirm them against the package documentation.
model = HiLASSO_Spark(
    X,
    y,
    alpha=0.05,
    q1='auto',
    q2='auto',
    L=30,
    cv=5,
    node='auto',
    logistic=False,
)

# fit() is the last expression of the cell, so its return value is what gets displayed.
model.fit()
C:\Users\Seungha\anaconda3\lib\site-packages\sklearn\externals\joblib\__init__.py:15: FutureWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.
  warnings.warn(msg, category=FutureWarning)
C:\Users\Seungha\anaconda3\lib\site-packages\sklearn\externals\six.py:31: FutureWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).
  "(https://pypi.org/project/six/).", FutureWarning)
Procedure_1_fin.
Procedure_2_fin.
[5]:
<Hi_LASSO_pyspark.HiLASSO_Spark at 0x172ece75f48>
[6]:
# Display the coef_ attribute of the fitted model (a 100-element array in the recorded output).
# Presumably the estimated coefficient of each column of X — confirm in the Hi_LASSO_pyspark docs.
model.coef_
[6]:
array([ 1.15905047e+00,  1.73054066e+00, -2.78546358e-01,  7.08538060e-01,
        2.93630644e-01, -3.09802967e-02,  9.87172481e-01,  4.98625094e-01,
        2.80438607e-01,  2.04938322e-03, -5.33703582e-01,  0.00000000e+00,
       -2.25060983e-02,  0.00000000e+00, -5.63347651e-01,  7.20865170e-03,
       -1.05214712e-02,  1.45435365e-02,  0.00000000e+00,  8.20807070e-03,
        2.18642615e-01, -5.03987466e-02,  0.00000000e+00,  7.34933400e-03,
        1.43663963e-01,  0.00000000e+00,  6.34305539e-03,  2.00344988e-02,
        3.13616708e-05,  6.35422045e-02, -2.01727289e-03,  0.00000000e+00,
        4.78273926e-02,  4.29157582e-01,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -1.40783507e-02,  0.00000000e+00,  0.00000000e+00,
        8.87616324e-02,  0.00000000e+00,  3.38209725e-01,  0.00000000e+00,
       -4.18006928e-02,  3.83894299e-02, -3.01721439e-02,  0.00000000e+00,
        0.00000000e+00, -1.45920949e-03,  0.00000000e+00, -3.20830882e-02,
       -1.49872298e-01,  0.00000000e+00, -3.98133525e-03,  0.00000000e+00,
        2.45903335e-03,  7.68685032e-03, -5.65818992e-02,  1.12274264e-01,
        2.13740191e-03,  0.00000000e+00, -4.18373709e-04,  0.00000000e+00,
       -7.87575129e-04,  0.00000000e+00,  4.93090821e-02,  0.00000000e+00,
       -4.18499004e-04, -3.56030163e-04,  3.92035324e-02,  2.35306927e-01,
        1.71041040e-02,  0.00000000e+00,  1.03596675e-02,  0.00000000e+00,
        2.09837938e-02, -2.43313781e-03,  0.00000000e+00,  0.00000000e+00,
       -2.72681876e-01,  1.64381733e-01,  1.56982030e-01,  1.43836815e-03,
        4.66357974e-03,  0.00000000e+00, -1.12394212e+00, -8.07932918e-02,
        0.00000000e+00,  0.00000000e+00, -1.70966580e-03,  0.00000000e+00,
       -2.23018886e-03,  1.26032087e-01,  0.00000000e+00,  1.03304962e-01,
        0.00000000e+00,  0.00000000e+00, -5.00787433e-02,  8.02132494e-03])
[7]:
# Display the p_values attribute of the fitted model (a 100-element array in the recorded output).
# Presumably one p-value per column of X — confirm in the Hi_LASSO_pyspark docs.
model.p_values
[7]:
array([5.45546950e-131, 2.01625779e-164, 1.31670825e-004, 8.80886752e-097,
       4.31004255e-027, 9.99999051e-001, 8.02834634e-123, 6.37878173e-048,
       6.27334431e-017, 1.00000000e+000, 1.70639215e-121, 1.00000000e+000,
       9.99733947e-001, 1.00000000e+000, 6.12580122e-098, 1.00000000e+000,
       1.00000000e+000, 9.99999704e-001, 1.00000000e+000, 1.00000000e+000,
       1.31517681e-007, 8.32544869e-001, 1.00000000e+000, 1.00000000e+000,
       2.85508681e-007, 1.00000000e+000, 1.00000000e+000, 9.99948594e-001,
       1.00000000e+000, 5.20943089e-003, 1.00000000e+000, 1.00000000e+000,
       1.85618624e-002, 3.49112719e-075, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       4.16256090e-004, 1.00000000e+000, 1.28847785e-054, 1.00000000e+000,
       5.31842478e-001, 2.19719125e-001, 9.81054785e-001, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 9.34796624e-001,
       2.85508681e-007, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 1.01690916e-001, 5.94884253e-008,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 8.73984401e-001, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 9.70377124e-001, 7.00544469e-023,
       9.99948594e-001, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       9.99997164e-001, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       2.73297075e-068, 8.03655110e-012, 1.39764626e-010, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 4.97483677e-211, 2.64252727e-008,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 1.39764626e-010, 1.00000000e+000, 4.94110926e-009,
       1.00000000e+000, 1.00000000e+000, 9.93078320e-001, 1.00000000e+000])
[8]:
# Display the selected_var attribute of the fitted model (a 100-element array in the recorded output).
# In that output each entry is either 0 or equal to the corresponding coef_ entry; presumably
# non-selected predictors are zeroed out — confirm the selection rule in the Hi_LASSO_pyspark docs.
model.selected_var
[8]:
array([ 1.15905047,  1.73054066, -0.27854636,  0.70853806,  0.29363064,
        0.        ,  0.98717248,  0.49862509,  0.28043861,  0.        ,
       -0.53370358,  0.        ,  0.        ,  0.        , -0.56334765,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.21864261,  0.        ,  0.        ,  0.        ,  0.14366396,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.42915758,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.08876163,  0.        ,  0.33820973,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        , -0.1498723 ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.11227426,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.23530693,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.27268188,  0.16438173,  0.15698203,  0.        ,  0.        ,
        0.        , -1.12394212, -0.08079329,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.12603209,  0.        ,
        0.10330496,  0.        ,  0.        ,  0.        ,  0.        ])