# 分类分析因素关联性 (Per-category analysis of the correlation between factors)
from sklearn.metrics import r2_score
from sklearn import linear_model
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pylab import *

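'''
Notes on the arguments of classify_scatter:
filename is the Excel file; it is read with pandas into a DataFrame named df.
column is the column whose values are used to split the data into categories.
labels is the list of category values to analyse, e.g. to study how the
adsorption energy relates to some variables for the cases "x" and "Mg"
found in the ads_ele column of the data file.
y_goal is the dependent variable to analyse, e.g. the adsorption energy.
x_goal is the independent variable to analyse.
'''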

def classify_scatter(filename, column, y_goal, x_goal, labels):
    """Scatter ``y_goal`` against ``x_goal`` per category, with a linear fit each.

    The Excel file is read once. For every entry of ``labels`` the rows whose
    ``column`` value equals that label (both compared as strings, so numeric
    category columns work too) are drawn as one scatter series. A linear
    regression is fitted to those rows and drawn as a dashed line, and the
    R^2 of the fit is put in the legend entry and printed.

    If at least one category was drawn, the figure is saved to
    ``'<y_goal> VS. <x_goal> sorted by <column>.pdf'``. The figure is left
    open (not closed) after saving.

    Args:
        filename: Path of the Excel file passed to ``pd.read_excel``.
        column: Name of the column used to split the rows into categories.
        y_goal: Name of the column used as dependent variable (y axis).
        x_goal: Name of the column used as independent variable (x axis).
        labels: Iterable of category values of ``column`` to plot.

    Side effects:
        Sets ``xtick.direction`` / ``ytick.direction`` in ``plt.rcParams``,
        creates a new matplotlib figure, writes a PDF file and prints one
        line per category.
    """
    plt.figure(figsize=(8.3, 6))
    axis_label_font = {'family': 'Arial', 'weight': 'normal', 'size': 26}
    # Tick marks point into the plot area on both axes.
    plt.rcParams['xtick.direction'] = 'in'
    plt.rcParams['ytick.direction'] = 'in'
    plt.yticks(fontproperties='Arial', size=24)
    plt.xticks(fontproperties='Arial', size=20)
    plt.ylabel(y_goal, axis_label_font)
    plt.xlabel(x_goal, axis_label_font)
    plt.minorticks_on()
    plt.tick_params(which='major', width=2, length=6)
    plt.tick_params(which='minor', width=2, length=4)

    # Thicken all four sides of the axes frame.
    frame_width = 2
    ax = plt.gca()
    for side in ('bottom', 'top', 'left', 'right'):
        ax.spines[side].set_linewidth(frame_width)

    # Read the workbook once; it does not change between categories.
    data = pd.read_excel(filename)
    # Compare categories as strings so that numeric columns still match
    # their labels (a raw int/float column never equals str(label)).
    categories = data[column].astype(str)

    plotted = False
    for label in labels:
        name = str(label)
        subset = data[categories == name]
        if subset.empty:
            # Fitting a regression on zero rows would raise; skip instead.
            print('{}: no matching rows, skipped'.format(name))
            continue

        y = np.array(subset[y_goal]).reshape(-1, 1)
        x = np.array(subset[x_goal]).reshape(-1, 1)

        linear = linear_model.LinearRegression()
        linear.fit(x, y)
        r2 = r2_score(y, linear.predict(x))

        plt.scatter(x, y, label=name + ':$R^2$=' + str(round(r2, 3)),
                    s=160, edgecolor='black', alpha=0.8)

        # Draw the fitted line over a range slightly wider than the data.
        # The bounds reproduce the original expressions unchanged.
        x_min = np.min(x)
        x_max = np.max(x)
        span = x_max - x_min
        x_axis = np.linspace(x_min - 0.1 * span, 1.1 * x_max + 0.3 * span,
                             10, endpoint=True).reshape(-1, 1)
        plt.plot(x_axis, linear.predict(x_axis), ls='dashed')

        print('{}的R2:{}'.format(name, r2))
        plotted = True

    if plotted:
        # Build the legend and write the file once, after all series exist.
        plt.legend(loc='best',
                   prop={'family': 'Arial', 'weight': 'normal', 'size': 14},
                   fancybox=False, edgecolor='black')
        plt.tight_layout()
        plt.savefig(y_goal + ' VS. ' + x_goal + ' sorted by ' + column + '.pdf',
                    dpi=300)

# Driver: for each grouping column, plot the adsorption-energy column against
# each descriptor column, one figure per (grouping column, descriptor) pair.
filename = 'raw.xlsx'
# Read the workbook once; it is only used here to derive the category labels
# (classify_scatter receives the file name and reads the data itself).
df = pd.read_excel(filename)
# Dependent variable (column name) used for every plot.
y_analysis = 'Ads_En'
for column in ['Spacegroup', 'type', 'ads_ele']:
    # Count the rows per distinct value of this column and keep only the
    # values that occur at least 5 times.
    temp = df[column].value_counts()
    labels = temp[temp > 4].index.tolist()
    # Independent variables (column names), one plot each.
    for x_analysis in ['Bader', 'Workfunction', 'D -0']:
        classify_scatter(filename, column, y_analysis, x_analysis, labels)