Getting Started

Data load

[1]:
#Import package
import numpy as np
import pandas as pd

from pyspark.sql.session import SparkSession

# Spark entry point for the notebook: getOrCreate() hands back the session that
# is already running if there is one, and builds a new one otherwise.
spark = SparkSession.builder.getOrCreate()
[2]:
#If using .json file
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Explicit schema for the feature file: 100 DoubleType columns named V1..V100,
# in that order. Built with a comprehension instead of 100 hand-written
# StructField entries so the column count and naming pattern are visible at a
# glance and cannot drift through a copy-paste slip.
sch = StructType([StructField('V{}'.format(i), DoubleType()) for i in range(1, 101)])
[3]:
def _to_float_columns(frame):
    """Return ``frame`` with every column cast to Spark's "float" type, column names unchanged."""
    recast = [col(name).cast("float").alias(name) for name in frame.columns]
    return frame.select(*recast)


# Features: parsed with the explicit schema `sch`, recast, then pulled into pandas.
# NOTE(review): `sch` declares DoubleType columns, yet they are cast to "float"
# right after reading - confirm this narrowing is intended.
X_Sparkdataframe = _to_float_columns(spark.read.schema(sch).json("simulation_data_x.json"))
X = X_Sparkdataframe.toPandas()

# Response: no schema is handed to the reader for this file; whatever columns it
# yields receive the same cast before the conversion to pandas.
y_Sparkdataframe = _to_float_columns(spark.read.json("simulation_data_y.json"))
y = y_Sparkdataframe.toPandas()
[4]:
# Preview the first rows of the pandas feature frame as a sanity check on the load.
X.head()
[4]:
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 ... V91 V92 V93 V94 V95 V96 V97 V98 V99 V100
0 0.101741 0.144909 0.235567 0.576186 0.299443 0.296395 0.902235 0.265811 0.420927 0.684045 ... -0.058677 0.497420 -1.124986 0.338215 -0.942943 -1.257044 -0.531471 1.236317 0.405682 0.387636
1 -0.105054 -0.110128 -0.033311 -0.042925 -0.752605 -0.794815 -1.699739 -1.891533 -1.287547 -1.154547 ... -1.614948 -1.337878 0.795742 1.101117 -0.920702 -0.098002 -0.269719 0.333092 -0.500367 1.340876
2 -0.478922 -0.058475 -0.620625 1.775435 0.935760 1.268173 -2.652118 -2.327299 -2.913926 -2.140256 ... 0.081620 0.999657 -0.594758 0.057804 -1.259650 0.321864 0.992930 0.552269 1.253700 0.974150
3 -0.657057 -0.508105 -0.556453 0.057045 -0.344814 -0.557824 -0.814844 0.016355 -0.384234 -0.022224 ... 0.077500 -1.489750 -0.151010 0.347814 2.281268 -1.275026 -0.141539 -0.335557 0.196004 -1.347782
4 -0.939275 -0.780565 -0.495687 -0.308105 -0.604543 -0.272711 0.080763 0.542582 0.580669 0.162638 ... 0.677662 0.188278 0.716616 -1.290398 -0.579556 -0.692827 -1.040820 -0.674525 1.355343 1.754870

5 rows × 100 columns

[5]:
# Preview the first rows of the pandas response frame as a sanity check on the load.
y.head()
[5]:
V1
0 2.811113
1 1.049249
2 -4.496389
3 -3.846408
4 -2.805357

General Usage

[6]:
from Hi_LASSO_pyspark import HiLASSO_Spark

# Build the Hi-LASSO estimator from the pandas feature and response frames.
# NOTE(review): the meaning of each keyword below is defined by the
# Hi_LASSO_pyspark package and is not visible in this notebook - check its
# documentation before changing any of them.
model = HiLASSO_Spark(
    X,
    y,
    alpha=0.05,
    q1='auto',
    q2='auto',
    L=30,
    cv=5,
    node='auto',
    logistic=False,
)
# Left as the cell's final expression so the notebook echoes what fit() returns.
model.fit()
C:\Users\Seungha\anaconda3\lib\site-packages\sklearn\externals\joblib\__init__.py:15: FutureWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.
  warnings.warn(msg, category=FutureWarning)
C:\Users\Seungha\anaconda3\lib\site-packages\sklearn\externals\six.py:31: FutureWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).
  "(https://pypi.org/project/six/).", FutureWarning)
Procedure_1_fin.
Procedure_2_fin.
[6]:
<Hi_LASSO_pyspark.HiLASSO_Spark at 0x1b7628c91c8>
[7]:
# Inspect the coefficient estimates stored on the fitted model.
model.coef_
[7]:
array([ 9.07767822e-01,  1.84831819e+00, -1.50886668e-01,  6.51613666e-01,
        3.60881908e-01, -5.43158333e-02,  1.10656217e+00,  5.76287223e-01,
        1.45350133e-01,  1.12891593e-02, -4.92306272e-01,  4.00331044e-03,
       -2.92755363e-02,  0.00000000e+00, -5.97072304e-01, -4.01800075e-03,
       -1.85472135e-02,  1.50973881e-02,  0.00000000e+00,  1.00734445e-03,
        1.82650706e-01, -2.44372503e-02,  0.00000000e+00,  4.87803321e-04,
        2.12434938e-01, -1.35364354e-03,  6.26210409e-03,  7.30404076e-02,
        0.00000000e+00,  4.32746612e-02,  6.84953978e-04,  0.00000000e+00,
        5.08000749e-02,  4.16104086e-01,  0.00000000e+00,  0.00000000e+00,
        6.12355843e-05, -2.15697968e-03,  0.00000000e+00,  0.00000000e+00,
        8.90277566e-02,  0.00000000e+00,  4.82355939e-01,  0.00000000e+00,
       -6.06668658e-02,  4.24659000e-02, -1.41782146e-02, -3.61172307e-03,
        0.00000000e+00, -1.65201959e-04,  0.00000000e+00, -1.42850442e-02,
       -1.45138412e-01,  0.00000000e+00,  9.37401342e-05,  0.00000000e+00,
        4.69654532e-03,  2.33781070e-02, -1.21564959e-01,  1.19742376e-01,
        4.37066230e-03,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -7.42586867e-03, -7.69866705e-04,  4.79935847e-03, -8.54272613e-03,
       -4.44192587e-04,  0.00000000e+00,  6.32937349e-03,  1.72394695e-01,
        1.89504145e-02,  0.00000000e+00,  4.31866689e-03,  0.00000000e+00,
        2.51616680e-02,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -2.12432332e-01,  1.61906818e-01,  1.70911957e-01,  0.00000000e+00,
        5.26630954e-04,  0.00000000e+00, -1.13084992e+00, -8.78953629e-02,
       -2.32628707e-03,  0.00000000e+00,  3.15969130e-03,  0.00000000e+00,
       -6.36326525e-04,  1.61205647e-01,  0.00000000e+00,  5.21366390e-02,
        2.89292450e-03,  0.00000000e+00, -5.86326872e-02,  2.83126071e-02])
[8]:
# Inspect the p-values stored on the fitted model.
model.p_values
[8]:
array([7.53704419e-089, 1.50423843e-183, 7.54534150e-001, 9.43249973e-088,
       2.82878036e-034, 9.98485520e-001, 7.92473484e-147, 3.05039555e-068,
       6.11829481e-003, 1.00000000e+000, 1.65138684e-117, 1.00000000e+000,
       9.45720852e-001, 1.00000000e+000, 4.57443521e-091, 1.00000000e+000,
       1.00000000e+000, 9.99999998e-001, 1.00000000e+000, 1.00000000e+000,
       1.27876186e-005, 9.99999540e-001, 1.00000000e+000, 1.00000000e+000,
       5.63291884e-020, 1.00000000e+000, 1.00000000e+000, 1.13863358e-001,
       1.00000000e+000, 7.54534150e-001, 1.00000000e+000, 1.00000000e+000,
       5.62464251e-001, 3.12847085e-079, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       5.00176690e-004, 1.00000000e+000, 1.90247524e-094, 1.00000000e+000,
       2.85225204e-004, 6.30455341e-001, 9.99233039e-001, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 9.99998548e-001,
       1.63646066e-007, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 9.99999865e-001, 4.69500947e-005, 6.18737269e-009,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 9.99999998e-001,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 1.75747652e-010,
       9.94847160e-001, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       9.99988256e-001, 1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
       1.47539558e-054, 7.77001029e-016, 1.38617488e-012, 1.00000000e+000,
       9.99969672e-001, 1.00000000e+000, 4.20762516e-215, 1.44173110e-008,
       1.00000000e+000, 1.00000000e+000, 9.99999998e-001, 1.00000000e+000,
       1.00000000e+000, 2.37906131e-015, 1.00000000e+000, 9.49056045e-003,
       1.00000000e+000, 1.00000000e+000, 3.58419958e-001, 9.99995730e-001])
[9]:
# Inspect the variable-selection result stored on the fitted model.
# NOTE(review): in the recorded output this array carries the same values as
# coef_ at some positions and 0 everywhere else - presumably the coefficients of
# the selected variables only; confirm against the package documentation.
model.selected_var
[9]:
array([ 0.90776782,  1.84831819,  0.        ,  0.65161367,  0.36088191,
        0.        ,  1.10656217,  0.57628722,  0.        ,  0.        ,
       -0.49230627,  0.        ,  0.        ,  0.        , -0.5970723 ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.18265071,  0.        ,  0.        ,  0.        ,  0.21243494,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.41610409,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.48235594,  0.        , -0.06066687,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        , -0.14513841,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        , -0.12156496,  0.11974238,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.17239469,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.21243233,  0.16190682,  0.17091196,  0.        ,  0.        ,
        0.        , -1.13084992, -0.08789536,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.16120565,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])