Getting Started
Data loading
[1]:
# Import packages and start a Spark session
import numpy as np
import pandas as pd
from pyspark.sql.session import SparkSession
# Reuse the active SparkSession if one already exists; otherwise start a new one with default settings.
spark = SparkSession.builder.getOrCreate()
[2]:
# If the data are stored as .json files, declare the schema before reading them
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Explicit schema for the feature file: 100 double-precision columns named V1 ... V100.
# The fields are generated rather than written out one by one, which keeps the
# column names, their order, and their type in a single readable expression.
sch = StructType([StructField('V' + str(i), DoubleType()) for i in range(1, 101)])
[3]:
def _read_json_as_float(path, schema=None):
    """Read a JSON file with Spark and cast every column to Spark's "float" type.

    Parameters
    ----------
    path : str
        Location of the JSON file, passed unchanged to the Spark JSON reader.
    schema : StructType, optional
        Explicit schema for the reader. When omitted, Spark infers the schema
        from the data.

    Returns
    -------
    pyspark.sql.DataFrame
        Frame with the same column names as the input, each cast to "float"
        (Spark's single-precision FloatType).
    """
    reader = spark.read if schema is None else spark.read.schema(schema)
    raw = reader.json(path)
    # Re-apply each original name explicitly so the column labels are unchanged after the cast.
    return raw.select(*(col(name).cast("float").alias(name) for name in raw.columns))


# Features: read with the explicit schema `sch` (columns V1 ... V100).
X_Sparkdataframe = _read_json_as_float("simulation_data_x.json", schema=sch)
# toPandas() collects the whole Spark DataFrame onto the driver as a pandas DataFrame.
X = X_Sparkdataframe.toPandas()

# Response: no schema is supplied, so Spark infers it from the file.
y_Sparkdataframe = _read_json_as_float("simulation_data_y.json")
y = y_Sparkdataframe.toPandas()
[4]:
# Preview the first five rows of the feature frame.
X.head()
[4]:
V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | ... | V91 | V92 | V93 | V94 | V95 | V96 | V97 | V98 | V99 | V100 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.101741 | 0.144909 | 0.235567 | 0.576186 | 0.299443 | 0.296395 | 0.902235 | 0.265811 | 0.420927 | 0.684045 | ... | -0.058677 | 0.497420 | -1.124986 | 0.338215 | -0.942943 | -1.257044 | -0.531471 | 1.236317 | 0.405682 | 0.387636 |
1 | -0.105054 | -0.110128 | -0.033311 | -0.042925 | -0.752605 | -0.794815 | -1.699739 | -1.891533 | -1.287547 | -1.154547 | ... | -1.614948 | -1.337878 | 0.795742 | 1.101117 | -0.920702 | -0.098002 | -0.269719 | 0.333092 | -0.500367 | 1.340876 |
2 | -0.478922 | -0.058475 | -0.620625 | 1.775435 | 0.935760 | 1.268173 | -2.652118 | -2.327299 | -2.913926 | -2.140256 | ... | 0.081620 | 0.999657 | -0.594758 | 0.057804 | -1.259650 | 0.321864 | 0.992930 | 0.552269 | 1.253700 | 0.974150 |
3 | -0.657057 | -0.508105 | -0.556453 | 0.057045 | -0.344814 | -0.557824 | -0.814844 | 0.016355 | -0.384234 | -0.022224 | ... | 0.077500 | -1.489750 | -0.151010 | 0.347814 | 2.281268 | -1.275026 | -0.141539 | -0.335557 | 0.196004 | -1.347782 |
4 | -0.939275 | -0.780565 | -0.495687 | -0.308105 | -0.604543 | -0.272711 | 0.080763 | 0.542582 | 0.580669 | 0.162638 | ... | 0.677662 | 0.188278 | 0.716616 | -1.290398 | -0.579556 | -0.692827 | -1.040820 | -0.674525 | 1.355343 | 1.754870 |
5 rows × 100 columns
[5]:
# Preview the first five rows of the response frame.
y.head()
[5]:
V1 | |
---|---|
0 | 2.811113 |
1 | 1.049249 |
2 | -4.496389 |
3 | -3.846408 |
4 | -2.805357 |
General Usage
[6]:
from Hi_LASSO_pyspark import HiLASSO_Spark
# Build the Hi-LASSO estimator on the pandas feature frame X and response frame y.
# Argument meanings are defined by Hi_LASSO_pyspark; logistic=False presumably
# selects the non-logistic (linear-regression) variant - confirm in the package docs.
model = HiLASSO_Spark(
    X,
    y,
    alpha=0.05,
    q1='auto',
    q2='auto',
    L=30,
    cv=5,
    node='auto',
    logistic=False,
)
# fit() stays the last expression of the cell, so the notebook displays its return value.
model.fit()
C:\Users\Seungha\anaconda3\lib\site-packages\sklearn\externals\joblib\__init__.py:15: FutureWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.
warnings.warn(msg, category=FutureWarning)
C:\Users\Seungha\anaconda3\lib\site-packages\sklearn\externals\six.py:31: FutureWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).
"(https://pypi.org/project/six/).", FutureWarning)
Procedure_1_fin.
Procedure_2_fin.
[6]:
<Hi_LASSO_pyspark.HiLASSO_Spark at 0x1b7628c91c8>
[7]:
# Show the coefficient array stored on the fitted model (100 values in the output below).
model.coef_
[7]:
array([ 9.07767822e-01, 1.84831819e+00, -1.50886668e-01, 6.51613666e-01,
3.60881908e-01, -5.43158333e-02, 1.10656217e+00, 5.76287223e-01,
1.45350133e-01, 1.12891593e-02, -4.92306272e-01, 4.00331044e-03,
-2.92755363e-02, 0.00000000e+00, -5.97072304e-01, -4.01800075e-03,
-1.85472135e-02, 1.50973881e-02, 0.00000000e+00, 1.00734445e-03,
1.82650706e-01, -2.44372503e-02, 0.00000000e+00, 4.87803321e-04,
2.12434938e-01, -1.35364354e-03, 6.26210409e-03, 7.30404076e-02,
0.00000000e+00, 4.32746612e-02, 6.84953978e-04, 0.00000000e+00,
5.08000749e-02, 4.16104086e-01, 0.00000000e+00, 0.00000000e+00,
6.12355843e-05, -2.15697968e-03, 0.00000000e+00, 0.00000000e+00,
8.90277566e-02, 0.00000000e+00, 4.82355939e-01, 0.00000000e+00,
-6.06668658e-02, 4.24659000e-02, -1.41782146e-02, -3.61172307e-03,
0.00000000e+00, -1.65201959e-04, 0.00000000e+00, -1.42850442e-02,
-1.45138412e-01, 0.00000000e+00, 9.37401342e-05, 0.00000000e+00,
4.69654532e-03, 2.33781070e-02, -1.21564959e-01, 1.19742376e-01,
4.37066230e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
-7.42586867e-03, -7.69866705e-04, 4.79935847e-03, -8.54272613e-03,
-4.44192587e-04, 0.00000000e+00, 6.32937349e-03, 1.72394695e-01,
1.89504145e-02, 0.00000000e+00, 4.31866689e-03, 0.00000000e+00,
2.51616680e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
-2.12432332e-01, 1.61906818e-01, 1.70911957e-01, 0.00000000e+00,
5.26630954e-04, 0.00000000e+00, -1.13084992e+00, -8.78953629e-02,
-2.32628707e-03, 0.00000000e+00, 3.15969130e-03, 0.00000000e+00,
-6.36326525e-04, 1.61205647e-01, 0.00000000e+00, 5.21366390e-02,
2.89292450e-03, 0.00000000e+00, -5.86326872e-02, 2.83126071e-02])
[8]:
# Show the p-value array stored on the fitted model.
# NOTE(review): presumably one p-value per coefficient, in the same order as model.coef_ - confirm in the package docs.
model.p_values
[8]:
array([7.53704419e-089, 1.50423843e-183, 7.54534150e-001, 9.43249973e-088,
2.82878036e-034, 9.98485520e-001, 7.92473484e-147, 3.05039555e-068,
6.11829481e-003, 1.00000000e+000, 1.65138684e-117, 1.00000000e+000,
9.45720852e-001, 1.00000000e+000, 4.57443521e-091, 1.00000000e+000,
1.00000000e+000, 9.99999998e-001, 1.00000000e+000, 1.00000000e+000,
1.27876186e-005, 9.99999540e-001, 1.00000000e+000, 1.00000000e+000,
5.63291884e-020, 1.00000000e+000, 1.00000000e+000, 1.13863358e-001,
1.00000000e+000, 7.54534150e-001, 1.00000000e+000, 1.00000000e+000,
5.62464251e-001, 3.12847085e-079, 1.00000000e+000, 1.00000000e+000,
1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
5.00176690e-004, 1.00000000e+000, 1.90247524e-094, 1.00000000e+000,
2.85225204e-004, 6.30455341e-001, 9.99233039e-001, 1.00000000e+000,
1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 9.99998548e-001,
1.63646066e-007, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
1.00000000e+000, 9.99999865e-001, 4.69500947e-005, 6.18737269e-009,
1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 9.99999998e-001,
1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.75747652e-010,
9.94847160e-001, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
9.99988256e-001, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
1.47539558e-054, 7.77001029e-016, 1.38617488e-012, 1.00000000e+000,
9.99969672e-001, 1.00000000e+000, 4.20762516e-215, 1.44173110e-008,
1.00000000e+000, 1.00000000e+000, 9.99999998e-001, 1.00000000e+000,
1.00000000e+000, 2.37906131e-015, 1.00000000e+000, 9.49056045e-003,
1.00000000e+000, 1.00000000e+000, 3.58419958e-001, 9.99995730e-001])
[9]:
# Show the selected-variable array stored on the fitted model.
# NOTE(review): in the output shown here this looks like model.coef_ with the entries of
# non-selected variables set to 0 - confirm the selection rule in the Hi_LASSO_pyspark docs.
model.selected_var
[9]:
array([ 0.90776782, 1.84831819, 0. , 0.65161367, 0.36088191,
0. , 1.10656217, 0.57628722, 0. , 0. ,
-0.49230627, 0. , 0. , 0. , -0.5970723 ,
0. , 0. , 0. , 0. , 0. ,
0.18265071, 0. , 0. , 0. , 0.21243494,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0.41610409, 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0.48235594, 0. , -0.06066687,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , -0.14513841, 0. , 0. ,
0. , 0. , 0. , -0.12156496, 0.11974238,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0.17239469, 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
-0.21243233, 0.16190682, 0.17091196, 0. , 0. ,
0. , -1.13084992, -0.08789536, 0. , 0. ,
0. , 0. , 0. , 0.16120565, 0. ,
0. , 0. , 0. , 0. , 0. ])