# Per-category analysis of the correlation between factors

from sklearn.metrics import r2_score
from sklearn import linear_model
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
from pylab import *

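'''
The data file is read with pandas into a DataFrame named df.
column is the column whose values are used to split the data into categories.
labels is the list of categories to analyse; e.g. for the data in data.csv, to study how the
adsorption energy relates to some variables when the ads_ele column is x or Mg.
y_goal is the dependent variable to analyse, e.g. the adsorption energy.
x_goal is the independent variable to analyse.
'''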

def classify_scatter(filename,column,y_goal,x_goal,labels):
    """Scatter-plot ``y_goal`` against ``x_goal`` per category, with a linear fit each.

    The Excel file is read once. For every entry of ``labels`` the rows whose
    ``column`` value matches that label are plotted as one scatter series, an
    ordinary least-squares line is fitted to them and drawn dashed, and the
    R^2 of that fit is shown in the legend and printed. All series share one
    figure, which is saved as
    ``'<y_goal> VS. <x_goal> sorted by <column>.pdf'`` (relative path).

    Args:
        filename: Path of the Excel file passed to ``pd.read_excel``.
        column: Name of the column used to split the rows into categories.
        y_goal: Name of the column used as the dependent variable.
        x_goal: Name of the column used as the independent variable.
        labels: Iterable of category values of ``column`` to plot.

    Returns:
        None. The results are the saved PDF and the printed R^2 values.
    """
    plt.figure(figsize=(8.3, 6))
    label_font = {'family': 'Arial', 'weight': 'normal', 'size': 26}
    legend_font = {'family': 'Arial', 'weight': 'normal', 'size': 14}
    # Draw the tick marks of both axes pointing into the plot area.
    plt.rcParams['xtick.direction'] = 'in'
    plt.rcParams['ytick.direction'] = 'in'
    plt.yticks(fontproperties='Arial', size=24)
    plt.xticks(fontproperties='Arial', size=20)
    plt.ylabel(y_goal, label_font)
    plt.xlabel(x_goal, label_font)
    plt.minorticks_on()
    plt.tick_params(which='major', width=2, length=6)
    plt.tick_params(which='minor', width=2, length=4)
    border_width = 2
    ax = plt.gca()
    for side in ('bottom', 'top', 'left', 'right'):
        ax.spines[side].set_linewidth(border_width)

    # Read the file once; it does not change between labels.
    data = pd.read_excel(filename)
    # Compare as strings so that non-string columns (e.g. numeric category
    # codes) still match their labels.
    category = data[column].astype(str)

    plotted = False
    for label in labels:
        name = str(label)
        subset = data[category == name]
        if subset.empty:
            # A fit on zero samples would raise; skip the category instead.
            print('No rows where {} == {}; skipped'.format(column, name))
            continue

        y = np.array(subset[y_goal]).reshape(-1, 1)
        x = np.array(subset[x_goal]).reshape(-1, 1)

        linear = linear_model.LinearRegression()
        linear.fit(x, y)
        r2 = r2_score(y, linear.predict(x))

        plt.scatter(x, y, label=name+':$R^2$='+str(round(r2, 3)), s=160, edgecolor='black', alpha=0.8)

        # Extend the fitted line slightly beyond the data range.
        # NOTE(review): the upper bound uses 1.1*max + 0.3*range, which is not
        # symmetric with the lower bound; kept unchanged - confirm it is intended.
        x_min = np.min(x)
        x_max = np.max(x)
        span = x_max - x_min
        x_axis = np.linspace(x_min-0.1*span, 1.1*x_max+0.3*span, 10, endpoint=True).reshape(-1, 1)
        plt.plot(x_axis, linear.predict(x_axis), ls='dashed')
        plotted = True

        print(name+'的R2:{}'.format(r2))

    # Build the legend and write the file once, after all series are drawn.
    if plotted:
        plt.legend(loc='best', prop=legend_font, fancybox=False, edgecolor='black')
        plt.tight_layout()
        plt.savefig(y_goal+' VS. '+x_goal+' sorted by '+column+'.pdf', dpi=300)

# Excel workbook holding the raw data.
filename = 'raw.xlsx'
# Read the workbook once; it is only used here to count rows per category.
df = pd.read_excel(filename)
# Dependent variable analysed in every plot.
y_analysis = 'Ads_En'

for column in ['Spacegroup', 'type', 'ads_ele']:
    # Count the rows per category and keep only categories with more than
    # 4 rows (i.e. at least 5 data points).
    temp = df[column].value_counts()
    labels = temp[temp > 4].index.tolist()
    # One figure per independent variable for the current grouping column.
    for x_analysis in ['Bader', 'Workfunction', 'D -0']:
        classify_scatter(filename, column, y_analysis, x_analysis, labels)