Getting Started
Data load
[1]:
from pyspark import SparkFiles
from pyspark.sql.functions import col
from pyspark.sql.session import SparkSession

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Remote CSV files holding the simulated predictors (X) and response (y).
url_X = 'https://raw.githubusercontent.com/seunghajeong/data_repo/master/simulation_data_x.csv'
url_y = 'https://raw.githubusercontent.com/seunghajeong/data_repo/master/simulation_data_y.csv'


def load_csv_as_float(session, url):
    """Fetch a remote CSV through Spark and return it with every column cast to float.

    Parameters
    ----------
    session : SparkSession
        Session used to register the file and to read it.
    url : str
        Location of a CSV file whose first row holds the column names.

    Returns
    -------
    pyspark.sql.DataFrame
        Same column names as the file, each column cast to Spark's "float" type.
    """
    # Register the file with Spark so it can be looked up through SparkFiles.
    session.sparkContext.addFile(url)
    # The file is looked up by its base name, i.e. the last path component of the URL.
    file_name = url.rsplit("/", 1)[-1]
    raw = session.read.csv(SparkFiles.get(file_name), header=True)
    # Without a schema the CSV reader yields string columns; cast each to float
    # while keeping the original column name.
    return raw.select(*(col(c).cast("float").alias(c) for c in raw.columns))


# Load each dataset as a Spark DataFrame, then collect it into pandas for the model.
X_Sparkdataframe = load_csv_as_float(spark, url_X)
X = X_Sparkdataframe.toPandas()
y_Sparkdataframe = load_csv_as_float(spark, url_y)
y = y_Sparkdataframe.toPandas()
[3]:
# Preview the first rows of the predictor matrix (100 columns, V1..V100, in the output below).
X.head()
[3]:
V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | ... | V91 | V92 | V93 | V94 | V95 | V96 | V97 | V98 | V99 | V100 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.101741 | 0.144909 | 0.235567 | 0.576186 | 0.299443 | 0.296395 | 0.902235 | 0.265811 | 0.420927 | 0.684045 | ... | -0.058677 | 0.497420 | -1.124986 | 0.338215 | -0.942943 | -1.257044 | -0.531471 | 1.236317 | 0.405682 | 0.387636 |
1 | -0.105054 | -0.110128 | -0.033311 | -0.042925 | -0.752605 | -0.794815 | -1.699739 | -1.891533 | -1.287547 | -1.154547 | ... | -1.614948 | -1.337878 | 0.795742 | 1.101117 | -0.920702 | -0.098002 | -0.269719 | 0.333092 | -0.500367 | 1.340876 |
2 | -0.478922 | -0.058475 | -0.620625 | 1.775435 | 0.935760 | 1.268173 | -2.652118 | -2.327299 | -2.913926 | -2.140256 | ... | 0.081620 | 0.999657 | -0.594758 | 0.057804 | -1.259650 | 0.321864 | 0.992930 | 0.552269 | 1.253700 | 0.974150 |
3 | -0.657057 | -0.508105 | -0.556453 | 0.057045 | -0.344814 | -0.557824 | -0.814844 | 0.016355 | -0.384234 | -0.022224 | ... | 0.077500 | -1.489750 | -0.151010 | 0.347814 | 2.281268 | -1.275026 | -0.141539 | -0.335557 | 0.196004 | -1.347782 |
4 | -0.939275 | -0.780565 | -0.495687 | -0.308105 | -0.604543 | -0.272711 | 0.080763 | 0.542582 | 0.580669 | 0.162638 | ... | 0.677662 | 0.188278 | 0.716616 | -1.290398 | -0.579556 | -0.692827 | -1.040820 | -0.674525 | 1.355343 | 1.754870 |
5 rows × 100 columns
[4]:
# Preview the first rows of the response (a single column, V1, in the output below).
y.head()
[4]:
V1 | |
---|---|
0 | 2.811113 |
1 | 1.049249 |
2 | -4.496389 |
3 | -3.846408 |
4 | -2.805357 |
General Usage
[5]:
from Hi_LASSO_pyspark import HiLASSO_Spark

# Configure the estimator with the pandas predictors and response loaded above.
# The precise meaning of each keyword is defined by Hi_LASSO_pyspark; see its
# documentation before changing these values.
model = HiLASSO_Spark(
    X,
    y,
    alpha=0.05,
    q1='auto',
    q2='auto',
    L=30,
    cv=5,
    node='auto',
    logistic=False,
)

# Kept as the cell's last expression: fit() returns the estimator, whose repr is
# shown as the cell output after the "Procedure_1_fin." / "Procedure_2_fin."
# progress messages.
model.fit()
C:\Users\Seungha\anaconda3\lib\site-packages\sklearn\externals\joblib\__init__.py:15: FutureWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.
warnings.warn(msg, category=FutureWarning)
C:\Users\Seungha\anaconda3\lib\site-packages\sklearn\externals\six.py:31: FutureWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).
"(https://pypi.org/project/six/).", FutureWarning)
Procedure_1_fin.
Procedure_2_fin.
[5]:
<Hi_LASSO_pyspark.HiLASSO_Spark at 0x172ece75f48>
[6]:
# Coefficient array of the fitted model (100 values in the output below, many exactly zero).
model.coef_
[6]:
array([ 1.15905047e+00, 1.73054066e+00, -2.78546358e-01, 7.08538060e-01,
2.93630644e-01, -3.09802967e-02, 9.87172481e-01, 4.98625094e-01,
2.80438607e-01, 2.04938322e-03, -5.33703582e-01, 0.00000000e+00,
-2.25060983e-02, 0.00000000e+00, -5.63347651e-01, 7.20865170e-03,
-1.05214712e-02, 1.45435365e-02, 0.00000000e+00, 8.20807070e-03,
2.18642615e-01, -5.03987466e-02, 0.00000000e+00, 7.34933400e-03,
1.43663963e-01, 0.00000000e+00, 6.34305539e-03, 2.00344988e-02,
3.13616708e-05, 6.35422045e-02, -2.01727289e-03, 0.00000000e+00,
4.78273926e-02, 4.29157582e-01, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, -1.40783507e-02, 0.00000000e+00, 0.00000000e+00,
8.87616324e-02, 0.00000000e+00, 3.38209725e-01, 0.00000000e+00,
-4.18006928e-02, 3.83894299e-02, -3.01721439e-02, 0.00000000e+00,
0.00000000e+00, -1.45920949e-03, 0.00000000e+00, -3.20830882e-02,
-1.49872298e-01, 0.00000000e+00, -3.98133525e-03, 0.00000000e+00,
2.45903335e-03, 7.68685032e-03, -5.65818992e-02, 1.12274264e-01,
2.13740191e-03, 0.00000000e+00, -4.18373709e-04, 0.00000000e+00,
-7.87575129e-04, 0.00000000e+00, 4.93090821e-02, 0.00000000e+00,
-4.18499004e-04, -3.56030163e-04, 3.92035324e-02, 2.35306927e-01,
1.71041040e-02, 0.00000000e+00, 1.03596675e-02, 0.00000000e+00,
2.09837938e-02, -2.43313781e-03, 0.00000000e+00, 0.00000000e+00,
-2.72681876e-01, 1.64381733e-01, 1.56982030e-01, 1.43836815e-03,
4.66357974e-03, 0.00000000e+00, -1.12394212e+00, -8.07932918e-02,
0.00000000e+00, 0.00000000e+00, -1.70966580e-03, 0.00000000e+00,
-2.23018886e-03, 1.26032087e-01, 0.00000000e+00, 1.03304962e-01,
0.00000000e+00, 0.00000000e+00, -5.00787433e-02, 8.02132494e-03])
[7]:
# P-value array of the fitted model (100 values in the output below).
model.p_values
[7]:
array([5.45546950e-131, 2.01625779e-164, 1.31670825e-004, 8.80886752e-097,
4.31004255e-027, 9.99999051e-001, 8.02834634e-123, 6.37878173e-048,
6.27334431e-017, 1.00000000e+000, 1.70639215e-121, 1.00000000e+000,
9.99733947e-001, 1.00000000e+000, 6.12580122e-098, 1.00000000e+000,
1.00000000e+000, 9.99999704e-001, 1.00000000e+000, 1.00000000e+000,
1.31517681e-007, 8.32544869e-001, 1.00000000e+000, 1.00000000e+000,
2.85508681e-007, 1.00000000e+000, 1.00000000e+000, 9.99948594e-001,
1.00000000e+000, 5.20943089e-003, 1.00000000e+000, 1.00000000e+000,
1.85618624e-002, 3.49112719e-075, 1.00000000e+000, 1.00000000e+000,
1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
4.16256090e-004, 1.00000000e+000, 1.28847785e-054, 1.00000000e+000,
5.31842478e-001, 2.19719125e-001, 9.81054785e-001, 1.00000000e+000,
1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 9.34796624e-001,
2.85508681e-007, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
1.00000000e+000, 1.00000000e+000, 1.01690916e-001, 5.94884253e-008,
1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
1.00000000e+000, 1.00000000e+000, 8.73984401e-001, 1.00000000e+000,
1.00000000e+000, 1.00000000e+000, 9.70377124e-001, 7.00544469e-023,
9.99948594e-001, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
9.99997164e-001, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
2.73297075e-068, 8.03655110e-012, 1.39764626e-010, 1.00000000e+000,
1.00000000e+000, 1.00000000e+000, 4.97483677e-211, 2.64252727e-008,
1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
1.00000000e+000, 1.39764626e-010, 1.00000000e+000, 4.94110926e-009,
1.00000000e+000, 1.00000000e+000, 9.93078320e-001, 1.00000000e+000])
[8]:
# Selected-variable array: in the output below, entries equal the matching coef_
# value for some positions and are zero for the rest.
# NOTE(review): the kept positions appear to be those with very small p-values;
# the exact selection rule lives in Hi_LASSO_pyspark — confirm there.
model.selected_var
[8]:
array([ 1.15905047, 1.73054066, -0.27854636, 0.70853806, 0.29363064,
0. , 0.98717248, 0.49862509, 0.28043861, 0. ,
-0.53370358, 0. , 0. , 0. , -0.56334765,
0. , 0. , 0. , 0. , 0. ,
0.21864261, 0. , 0. , 0. , 0.14366396,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0.42915758, 0. ,
0. , 0. , 0. , 0. , 0. ,
0.08876163, 0. , 0.33820973, 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , -0.1498723 , 0. , 0. ,
0. , 0. , 0. , 0. , 0.11227426,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0.23530693, 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
-0.27268188, 0.16438173, 0.15698203, 0. , 0. ,
0. , -1.12394212, -0.08079329, 0. , 0. ,
0. , 0. , 0. , 0.12603209, 0. ,
0.10330496, 0. , 0. , 0. , 0. ])