关注小程序 找一找教程网-随时随地学编程

机器学习

机器学习-kaggle超市客户分类

#本次采用KMeans聚类分析数据
import pandas as pd
import numpy as npp
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
import matplotlib
import warnings
warnings.filterwarnings('ignore')
# Display every row/column and print floats with six decimals.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option('display.float_format', lambda x: '%.6f' % x)

# Load the data and rename the columns.
# NOTE(review): the gender column is accepted under either the 'Genre' or the
# 'Gender' header; rename() ignores keys that are not present.
df = pd.read_csv('Mall_Customers.csv')
df = df.rename(columns={
    'CustomerID': '顾客编号',
    'Genre': '性别',
    'Gender': '性别',
    'Age': '年纪',
    'Annual Income (k$)': '年收入',
    'Spending Score (1-100)': '消费分数',
})

# Encode gender as Male=1 / Female=0.
# Assign the result back explicitly: an in-place replace on the `df.性别`
# attribute is a chained mutation that does not reach `df` when pandas
# Copy-on-Write is active, which would leave the column as strings.
# Any value other than 'Male'/'Female' becomes NaN and therefore shows up in
# the missing-value count just below.
df['性别'] = df['性别'].map({'Male': 1, 'Female': 0})

# Data checks: count the missing values per column (notebook display).
df.isnull().sum()

# Per-column mean and standard deviation, kept so that the cluster centres can
# later be converted back to the original units.
dfms = pd.concat([df.mean().to_frame(), df.std().to_frame()], axis=1).transpose()
dfms.index = ['mean', 'std']

# Standardise every column to zero mean / unit std, then restore the raw 0/1
# gender flag, which must stay unscaled for the split below.
df_scaled = (df - dfms.loc['mean']) / dfms.loc['std']
df_scaled['性别'] = df['性别']
df_scaled

# Split by gender; iloc[:, 1:] drops the first column (the customer id).
dff = df_scaled.loc[df_scaled['性别'] == 0].iloc[:, 1:]
dfm = df_scaled.loc[df_scaled['性别'] == 1].iloc[:, 1:]
# Find the best number of centroids (elbow method).
def numbers_of_clusters(df, max_clusters=19):
    """Fit KMeans for k = 1..max_clusters and collect the inertia of each fit.

    Args:
        df: Feature table passed unchanged to ``KMeans.fit``.
        max_clusters: Largest k to try (default 19, the range the script has
            always scanned). It is capped at ``len(df)`` because KMeans cannot
            fit more clusters than there are samples.

    Returns:
        A DataFrame with one row per k and two columns: ``n_clusters`` (the k
        that was actually fitted) and ``within_cluster_sum_of_square`` (the
        fitted model's ``inertia_``).
    """
    upper = min(max_clusters, len(df))
    ks = list(range(1, upper + 1))
    inertias = []
    for k in ks:
        km = KMeans(n_clusters=k, random_state=158)
        km.fit(df)
        # inertia_ is used to judge whether k is suitable: the smaller it is,
        # the tighter the clusters; pick k at the elbow of the curve.
        inertias.append(km.inertia_)
    # Store the real k values. Labelling rows with the 0-based list position
    # (what reset_index() produced) shifted every k down by one.
    return pd.DataFrame({
        'n_clusters': ks,
        'within_cluster_sum_of_square': inertias,
    })
    
# Compute the inertia curve for each gender. Keep them in separate variables:
# re-using a single name made both panels plot the male curve.
df_final_female = numbers_of_clusters(dff)
df_final_male = numbers_of_clusters(dfm)
df_final = df_final_male  # same final binding as before, for any later use
df_final

# Plot both elbow curves side by side; 5 is the chosen number of centroids.
CHOSEN_K = 5

# Configure matplotlib BEFORE the figure is created; rcParams changed after
# plt.subplot() has made the figure do not affect that figure.
matplotlib.rcParams['font.family'] = 'SimHei'  # font able to render Chinese labels
matplotlib.rcParams['figure.figsize'] = (16, 6)
matplotlib.rcParams['font.size'] = 12

for panel, (title, curve) in enumerate(
        [('Female', df_final_female), ('Male', df_final_male)], start=1):
    plt.subplot(1, 2, panel)
    plt.plot(curve.n_clusters, curve.within_cluster_sum_of_square)
    # Tick on the k values that are actually present in the curve.
    plt.xticks(curve.n_clusters)
    plt.title(title)
    # Mark the chosen k by value rather than by row position.
    chosen = curve[curve.n_clusters == CHOSEN_K]
    plt.scatter(x=chosen.n_clusters, y=chosen.within_cluster_sum_of_square,
                color='black', marker='*')
# Cluster the customers.
def k_means(n_clusters, df, gender):
    """Cluster ``df`` with KMeans and return one row per cluster centre.

    Args:
        n_clusters: Number of clusters to fit.
        df: Feature table passed to ``KMeans.fit``; its column names are
            reused for the centroid table.
        gender: Value written into the '性别' column of every returned row.

    Returns:
        A DataFrame holding the centroid coordinates, the '性别' column set to
        ``gender`` and a 'count' column with the number of samples assigned to
        each cluster (matched to the centroid rows by cluster label).
    """
    model = KMeans(n_clusters=n_clusters, random_state=0).fit(df)
    # Samples per cluster, indexed by cluster label.
    members_per_label = pd.Series(model.labels_).value_counts()
    cdf = pd.DataFrame(model.cluster_centers_, columns=df.columns)
    cdf['性别'] = gender
    # Index-aligned assignment: label i lands on centroid row i.
    cdf['count'] = members_per_label
    return cdf

# Cluster each gender separately and stack the two centroid tables.
# The row index is intentionally left as 0..4 twice (one run per gender).
df1 = k_means(5, dfm, 'Male')
df2 = k_means(5, dff, 'Female')
dfc_scaled = pd.concat([df1, df2], axis=0)
dfc_scaled

# Undo the standardisation so the centroids are in the original units.
# The gender label and the member count were never scaled and are copied as is;
# the other columns are truncated to whole numbers by astype(int).
dfc = pd.DataFrame()
for i in dfc_scaled.columns:
    if i in ('性别', 'count'):
        dfc[i] = dfc_scaled[i]
    else:
        dfc[i] = (dfc_scaled[i] * dfms.loc['std', i] + dfms.loc['mean', i]).astype(int)
dfc

# Classification: attach a description to every cluster.
dfc['type'] = 1  # placeholder, overwritten per gender below
a_i = dfms.loc['mean']['年收入']
s_s = dfms.loc['mean']['消费分数']
# Take explicit copies so that the assignments below write to independent
# frames instead of to slices of dfc (SettingWithCopy).
dfcm = dfc[dfc['性别'] == 'Male'].copy()
dfcf = dfc[dfc['性别'] == 'Female'].copy()
# NOTE(review): the descriptions are hand-written for the cluster order
# produced by the fits above - re-check them if the data, k or random_state
# change. They are assigned by position, so a different number of clusters
# raises an error instead of silently producing NaN.
remark = ['年长/有孩子的收入一般的潜在男性客户', '中年/有孩子的收入较高的优质男客户', '年轻的收入一般的潜力男客户', '年长/有孩子的收入较低的男客户', '中年/有孩子的收入较高的潜在男客户']
dfcm['type'] = remark
remark = ['年长/有孩子的收入一般的潜在女性客户', '年轻的收入一般的潜力女客户', '中年/有孩子的收入较高的优质女客户', '年轻的收入较低的可发展女客户', '中年/有孩子的收入较高的一般女客户']
dfcf['type'] = remark
dfcm
dfcf