I am trying to run lasso optimization on data with 950 samples and roughly 5000 features. The lasso objective is $\frac{1}{2\,n_{\text{samples}}}\|y - Xw\|_2^2 + \alpha\|w\|_1$. As soon as I minimize with an initialization, I get a completely different $w$. This is strange, because the lasso objective is convex, so the initialization should not affect the result. Below are the lasso results with and without initialization. tol is the tolerance: if the change in $w$ falls below the tolerance, the solver reports convergence.

tol=0.00000001
#####  lasso model errors  #####


gene: 5478 matrix error: 0.069611732213
with initialization: alpha: 1e-20 promotion: -3.58847815733e-13
coef: [-0.00214732 -0.00509795  0.00272167 -0.00651548 -0.00164646 -0.00115342
  0.00553346  0.01047653  0.00139832]
without initialization: alpha: 1e-20  promotion: -19.0735249749
coef: [-0.03650629  0.08992003 -0.01287155  0.03203973  0.1567577  -0.03708655
-0.13710957 -0.01252736 -0.21710334]


with initialization: alpha: 1e-15 promotion: 1.06179081478e-10
coef: [-0.00214732 -0.00509795  0.00272167 -0.00651548 -0.00164646 -0.00115342
  0.00553346  0.01047653  0.00139832]
without initialization: alpha: 1e-15  promotion: -19.0735249463
coef: [-0.03650629  0.08992003 -0.01287155  0.03203973  0.1567577  -0.03708655
-0.13710957 -0.01252736 -0.21710334]



Warning (from warnings module):
  File "/usr/local/lib/python2.7/site-packages/sklearn/linear_model/coordinate_descent.py", line 491
    ConvergenceWarning)
ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
with initialization: alpha: 1e-10  promotion: 0.775144987537
coef: [-0.00185139 -0.0048819   0.00218349 -0.00622618 -0.00145647 -0.00115857
  0.0055919   0.01072924  0.00043773]
without initialization: alpha: 1e-10 promotion: -17.8649603301
coef: [-0.03581581  0.0892119  -0.01232829  0.03151441  0.15606195 -0.03734093
-0.13604286 -0.01247732 -0.21233529]


with initialization: alpha: 1e-08 promotion: -5.87121366314
coef: [-0.          0.         -0.         -0.01064477  0.         -0.00116167
-0.          0.01114746  0.        ]
without initialization: alpha: 1e-08  promotion: 4.05593555389
coef: [ 0.          0.04505117  0.00668611  0.          0.07731668 -0.03537848
-0.03151995  0.         -0.00310122]


max promote:
4.05593555389


For the implementation I used the Lasso class from the Python package sklearn.linear_model. I also switched to different data, but the results on the new data likewise changed with the initialization. I find this strange, but I have not been able to analyze it and find an explanation.

Here is the part of my code related to the lasso. My data are gene expression values. I tested the code on both normalized and non-normalized data; on both, the results differ between initial points.

    alpha_lasso = [1e-20, 1e-15, 1e-10, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1, 5, 20]

    lassoreg = Lasso(alpha=alpha_lasso[i], warm_start=True, tol=1e-8, max_iter=100000)
    lassoreg.coef_ = mybeta[:, j - c]  # initial point, used because warm_start=True
    lassoreg.fit(train[:, predictors], train[:, y])
    y_train_pred = lassoreg.predict(train[:, predictors])  # was predict(A); A undefined
    y_test_pred = lassoreg.predict(test[:, predictors])    # was predict(C); C undefined

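For comparison, here is a minimal sketch on synthetic, well-conditioned data (n > p, not my gene-expression matrix) using the same warm_start pattern; there, both starting points reach the same coefficients once the solver fully converges:

    import numpy as np
    from sklearn.linear_model import Lasso

    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)                      # n > p: strictly convex problem
    y = X.dot(np.array([1.0, 0.0, -2.0, 0.0, 0.5])) + 0.01 * rng.randn(100)

    m1 = Lasso(alpha=0.01, warm_start=True, tol=1e-10, max_iter=100000)
    m1.coef_ = np.ones(5)                      # hand-set start, used since warm_start=True
    m1.fit(X, y)

    m2 = Lasso(alpha=0.01, tol=1e-10, max_iter=100000).fit(X, y)  # default zero start

    print(np.allclose(m1.coef_, m2.coef_, atol=1e-6))  # True once both runs converge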

And here is my entire code:

from __future__ import division  # Python 2.7: make 1/(2*r) a float division, not integer division

import pandas as pd
import random
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import os
from GEOparse.GEOTypes import (GSE, GSM, GPL, GDS,
                               GDSSubset, GEODatabase,
                               DataIncompatibilityException,
                               NoMetadataException,
                               )
import GEOparse as GEO
import copy
import sys
import math
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoLars
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.linear_model import coordinate_descent
from sklearn.linear_model import lasso_path, enet_path
from sklearn.base import BaseEstimator, RegressorMixin
from copy import deepcopy

miss_percent = 0.1
alpha_lasso = [1e-20,1e-15, 1e-10, 1e-8, 1e-7,1e-6,1e-5,1e-4, 1e-3,1e-2, 1, 5 ,20]
mins=[]
maxs=[]
mean_err=[]
alphas=[]

mins1=[]
maxs1=[]
mean_err1=[]
alphas1=[]

#mnist = input_data.read_data_sets('../../MNIST_data', one_hot=True)
def getdata(percent):
    gsd = GEO.get_GEO(geo="GDS4971")
    ngsd = gsd.table.replace('null', np.NaN)
    ngsd = ngsd.dropna(axis=0, how='any')
    ngsd =ngsd.transpose()
    dataarray = ngsd.values
    data = np.delete(dataarray, [0,1], 0)

    x = data.astype(np.float)
    r_df = x.shape[0]
    c_df = x.shape[1]
    r = int(r_df-math.sqrt((1-percent)*r_df))
    c = int(c_df-math.sqrt((1-percent)*c_df))
    train = x[0:r,:]
    test = x[r:r_df,:]
    return x,train,test,r_df,c_df,r,c


genedata,train,test,r_df,c_df,r,c = getdata(miss_percent)
predictors = range(0,c)

promotion =[[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
promotion = np.asmatrix(promotion)
#error of ax-b
error_aw_b = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
error_aw_b = np.asmatrix(error_aw_b)
#error of cw-x
error_cw_x = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
error_cw_x = np.asmatrix(error_cw_x)
#error of lasso function
error_lasso = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
error_lasso = np.asmatrix(error_lasso)

promotion1 = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
promotion1 = np.asmatrix(promotion1)
#error of ax-b
error_aw_b1 = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
error_aw_b1 = np.asmatrix(error_aw_b1)
#error of cw-x
error_cw_x1 = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
error_cw_x1 = np.asmatrix(error_cw_x1)
#error of lasso function
error_lasso1 = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
error_lasso1 = np.asmatrix(error_lasso1)


mybeta = np.zeros((c, c_df - c))  # any initialization; zeros here are only a runnable placeholder

######################              LASSO              #####################
print("#####  lasso model errors  #####")
for j in range(c,c+1):
    mean_err=[]
    print("\n")
    y=j
    eachMeanError = math.sqrt((np.power(errorC[:,j-c],2)).sum()/(r_df-r))  # errorC: baseline error matrix, computed elsewhere (not shown)
    print("gene: "+str(j)+ " matrix error: "+ str(eachMeanError))
    for i in range(0,4):  # only the first four alphas; use len(alpha_lasso) for the full sweep
        lassoreg = Lasso(alpha=alpha_lasso[i],warm_start=True,tol=1e-8,max_iter=100000)
        lassoreg.coef_ = mybeta[:,j-c]  # initial point, used because warm_start=True
        lassoreg.fit(train[:,predictors],train[:,y])
        y_train_pred = lassoreg.predict(train[:,predictors])  # was predict(A); A undefined
        y_test_pred = lassoreg.predict(test[:,predictors])  # was predict(C); C undefined
        # lasso objective from the question: (1/(2n))*||y - Xw||^2_2 + alpha*||w||_1
        y_lasso_func = (1/(2*r))*sum((y_train_pred-train[:,y])**2)+alpha_lasso[i]*sum(abs(lassoreg.coef_))
        ##################      RMS     ##################
        error_aw_b[j-c,i] = math.sqrt(sum((y_train_pred-train[:,y])**2)/r)
        error_lasso[j-c,i] = y_lasso_func
        error_cw_x[j-c,i] = math.sqrt(sum((y_test_pred-test[:,y])**2)/(r_df-r))

        mins.extend([(error_cw_x.min())])
        maxs.extend([(error_cw_x.max())])

        promotion[j-c,i] = (((eachMeanError-error_cw_x[j-c,i])/eachMeanError)*100)
        print("alpha: "+str(alpha_lasso[i])+ " error_aw_b: "+str(error_aw_b[j-c,i]) + " error_cw_x: " + str(error_cw_x[j-c,i])+" error_lasso: "+str(error_lasso[j-c,i]) + " promotion: " + str(promotion[j-c,i]) )
        print("coef: " + str(lassoreg.coef_[1:10]))

        lassoreg1 = Lasso(alpha=alpha_lasso[i],tol=1e-8,max_iter=100000)  # no warm start: default zero init
        lassoreg1.fit(train[:,predictors],train[:,y])
        y_train_pred1 = lassoreg1.predict(train[:,predictors])
        y_test_pred1 = lassoreg1.predict(test[:,predictors])
        y_lasso_func1 = (1/(2*r))*sum((y_train_pred1-train[:,y])**2)+alpha_lasso[i]*sum(abs(lassoreg1.coef_))
        ##################      RMS     ##################
        error_aw_b1[j-c,i] = math.sqrt(sum((y_train_pred1-train[:,y])**2)/r)
        error_lasso1[j-c,i] = y_lasso_func1
        error_cw_x1[j-c,i] = math.sqrt(sum((y_test_pred1-test[:,y])**2)/(r_df-r))
        mins1.extend([(error_cw_x1.min())])
        maxs1.extend([(error_cw_x1.max())])

        promotion1[j-c,i] = (((eachMeanError-error_cw_x1[j-c,i])/eachMeanError)*100)
        print("alpha: "+str(alpha_lasso[i])+ " error_aw_b: "+str(error_aw_b1[j-c,i]) + " error_cw_x: " + str(error_cw_x1[j-c,i])+" error_lasso: "+str(error_lasso1[j-c,i]) + " promotion: " + str(promotion1[j-c,i]) )
        print("coef: " + str(lassoreg1.coef_[1:10]))
        print("\n")
    print("max promote:")
    print((promotion[j-c,:].max()))

f = open('analyse_col', 'wb')
np.save(f, [promotion,alphas,error_cw_x,mins,maxs])
f.close()

plt.plot(promotion[:,j-c])
plt.ylabel('promotion (%)')
plt.xlabel('each gene')
plt.show()

Best Answer

With M = 950 and N = 5000, you have M samples and N features.

The key point is this: "But when p > n, the lasso criterion is not strictly convex, and hence it may not have a unique minimizer." reference
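A minimal sketch of what that means in practice (toy data, purely for illustration): duplicate one feature, and any split of its weight across the two copies with equal signs leaves both the squared error and the L1 norm unchanged, so infinitely many coefficient vectors attain exactly the same objective value.

    import numpy as np

    rng = np.random.RandomState(0)
    x = rng.randn(50)
    y = 2.0 * x + 0.1 * rng.randn(50)
    X = np.column_stack([x, x])                # duplicated feature: rank-deficient design

    def lasso_objective(w, X, y, alpha):
        n = len(y)
        return np.sum((y - X.dot(w)) ** 2) / (2.0 * n) + alpha * np.sum(np.abs(w))

    w_a = np.array([1.5, 0.5])                 # one way to split the weight ...
    w_b = np.array([0.5, 1.5])                 # ... and another: same fit, same L1 norm
    print(lasso_objective(w_a, X, y, 0.01) == lasso_objective(w_b, X, y, 0.01))  # True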

This complicates the optimization a bit (keep in mind: this is not the easiest of problems to begin with, since it is inherently non-smooth!), and most solvers are tuned for the other case.

In your case there is an explicit warning together with advice: increase the number of iterations! And make sure your alpha values are not too small. I don't know how you initialized the latter, but if those 1e-15 magnitudes are hand-crafted, rethink your problem formulation!
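Both suggestions in a hedged sketch (X_train and y_train are hypothetical stand-ins for your training arrays; ConvergenceWarning lives in sklearn.exceptions from version 0.18 on, in sklearn.utils before that): turn the warning into a hard error so a non-converged coef_ is never used silently, and let cross-validation pick alpha from a data-driven grid instead of hand-picked values like 1e-20.

    import warnings
    from sklearn.exceptions import ConvergenceWarning  # older sklearn: sklearn.utils
    from sklearn.linear_model import Lasso, LassoCV

    # 1) fail loudly on non-convergence instead of keeping a half-optimized coef_
    with warnings.catch_warnings():
        warnings.simplefilter("error", ConvergenceWarning)
        try:
            Lasso(alpha=1e-20, tol=1e-8, max_iter=100000).fit(X_train, y_train)
        except ConvergenceWarning:
            print("did not converge: raise max_iter or use a larger alpha")

    # 2) choose alpha by cross-validation on an automatically scaled grid
    cv_model = LassoCV(cv=5, max_iter=100000).fit(X_train, y_train)
    print("data-driven alpha:", cv_model.alpha_)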

The warnings alone are reason enough not to treat these solutions as optimal (so, strictly speaking, "my lasso has different solutions for different inits" is technically incorrect; only your approximations of the solutions behave that way).
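You can check this directly by evaluating the question's objective at both returned coef_ vectors; a rough sketch (coef_init, coef_noinit, X_train, y_train are hypothetical names for the two fits and the data above). If the two values differ, at least one run stopped short of the optimum, so the pair says nothing about multiple true minimizers.

    import numpy as np

    def lasso_objective(w, X, y, alpha):
        # (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
        n = X.shape[0]
        return np.sum((y - X.dot(w)) ** 2) / (2.0 * n) + alpha * np.sum(np.abs(w))

    f_init = lasso_objective(coef_init, X_train, y_train, 1e-8)
    f_noinit = lasso_objective(coef_noinit, X_train, y_train, 1e-8)
    print(f_init, f_noinit)   # different values => at least one fit is not at the minimum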

On the topic of python - why do different initial points lead to different results in (convex) lasso optimization?, there is a similar question on Stack Overflow: https://stackoverflow.com/questions/46390193/
