Python数据分析实战：从入门到精通

2026-03-26•编程技术

Python数据分析实战：从入门到精通

Python是数据分析领域最受欢迎的编程语言之一。本文将系统介绍Python数据分析的核心工具和实战技巧。

环境搭建

1. 安装Anaconda

# Download the Anaconda installer
# (the 2024.02 release is published in the archive as build "2024.02-1")
wget https://repo.anaconda.com/archive/Anaconda3-2024.02-1-Linux-x86_64.sh

# Run the installer
bash Anaconda3-2024.02-1-Linux-x86_64.sh

# Create a virtual environment
conda create -n data_analysis python=3.11

# Activate the environment
conda activate data_analysis

2. 安装核心库

# openpyxl is the engine pandas needs for pd.read_excel() on .xlsx files
pip install pandas numpy matplotlib seaborn scipy scikit-learn jupyter openpyxl

NumPy基础

1. 数组创建

import numpy as np

# Build an array from a Python list
arr1 = np.array([1, 2, 3, 4, 5])

# Arrays with preset contents
zeros = np.zeros((3, 4))          # 3x4 matrix of zeros
ones = np.ones((2, 3))            # 2x3 matrix of ones
identity = np.eye(3)              # 3x3 identity matrix
random_arr = np.random.rand(3, 3) # 3x3 matrix of uniform random samples in [0, 1)

# Evenly spaced sequences
arange_arr = np.arange(0, 10, 2)  # [0, 2, 4, 6, 8] (the stop value is excluded)
linspace_arr = np.linspace(0, 1, 5)  # [0, 0.25, 0.5, 0.75, 1] (the endpoint is included)

# Build a 2-D array from nested lists
arr_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

2. 数组操作

# Array attributes
print(arr_2d.shape)    # (3, 3)
print(arr_2d.ndim)     # 2
print(arr_2d.size)     # 9
print(arr_2d.dtype)    # int64 on most platforms (int32 on Windows with NumPy < 2.0)

# Indexing and slicing
print(arr_2d[0, 1])    # 2
print(arr_2d[0:2, 1:3])  # [[2, 3], [5, 6]]

# Boolean indexing: keep the elements where the mask is True
mask = arr_2d > 5
print(arr_2d[mask])    # [6, 7, 8, 9]

# Element-wise arithmetic
arr_a = np.array([1, 2, 3])
arr_b = np.array([4, 5, 6])

print(arr_a + arr_b)   # [5, 7, 9]
print(arr_a * arr_b)   # [4, 10, 18]
print(arr_a ** 2)      # [1, 4, 9]

# Broadcasting: the 1-D array is added to every row of the 2-D array
arr_c = np.array([[1, 2, 3], [4, 5, 6]])
arr_d = np.array([10, 20, 30])
print(arr_c + arr_d)   # [[11, 22, 33], [14, 25, 36]]

# Summary statistics over all elements
print(np.mean(arr_2d))      # mean
print(np.std(arr_2d))       # standard deviation (population, ddof=0)
print(np.sum(arr_2d))       # sum
print(np.max(arr_2d))       # maximum
print(np.min(arr_2d))       # minimum
print(np.argmax(arr_2d))    # index of the maximum in the flattened array

3. 数组变形

arr = np.arange(12)

# reshape: the 12 elements laid out as 3 rows x 4 columns
reshaped = arr.reshape(3, 4)
print(reshaped)
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]

# Transpose (4 rows x 3 columns)
transposed = reshaped.T

# Flatten back to 1-D; flatten() returns a copy
flattened = reshaped.flatten()

# Stacking (note: arr1 from the array-creation example is rebound here)
arr1 = np.array([[1, 2], [3, 4]])
arr2 = np.array([[5, 6], [7, 8]])

vertical = np.vstack((arr1, arr2))    # stack vertically -> shape (4, 2)
horizontal = np.hstack((arr1, arr2))  # stack horizontally -> shape (2, 4)

Pandas数据处理

1. 数据结构

import pandas as pd

# Series: a labelled 1-D array; np.nan marks a missing value
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)

# DataFrame built column by column; scalar values are repeated for every row
df = pd.DataFrame({
    'A': 1.0,
    'B': pd.Timestamp('20240101'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo'
})

# Build from a dict of equal-length lists
# (each of the following examples rebinds df, replacing the previous frame)
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'age': [25, 30, 35, 28],
    'city': ['New York', 'Los Angeles', 'Chicago', 'Houston'],
    'salary': [50000, 60000, 75000, 65000]
}
df = pd.DataFrame(data)

# Read from a CSV file
df = pd.read_csv('data.csv')

# Read from an Excel file (pandas needs an Excel engine such as openpyxl for .xlsx)
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')

# Read from SQL
import sqlite3
from contextlib import closing

# sqlite3's own context manager only commits or rolls back a transaction;
# closing() guarantees the connection is closed once the query has been read,
# even if read_sql_query raises.
with closing(sqlite3.connect('database.db')) as conn:
    df = pd.read_sql_query("SELECT * FROM users", conn)

2. 数据查看

# First rows (5 by default, or the requested number)
print(df.head())
print(df.head(10))

# Last rows
print(df.tail())

# Basic information
# df.info() prints its report itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the report.
df.info()
print(df.describe())

# Shape as (rows, columns)
print(df.shape)

# Column labels
print(df.columns)

# Row index
print(df.index)

# Data type of each column
print(df.dtypes)

# Summary statistics including non-numeric columns
print(df.describe(include='all'))

3. 数据选择

# Select columns
print(df['name'])           # single column
print(df[['name', 'age']])  # several columns

# Select rows
print(df.loc[0])            # by index label
print(df.iloc[0])           # by integer position
print(df[0:3])              # slice of rows

# Boolean filtering; combine conditions with & and parenthesize each one
print(df[df['age'] > 30])
print(df[(df['age'] > 25) & (df['salary'] > 60000)])

# loc (label-based) and iloc (position-based)
print(df.loc[0, 'name'])                    # single value
print(df.loc[0:2, ['name', 'age']])         # rows labelled 0 through 2 (end label included)
print(df.iloc[0:3, 0:2])                    # first 3 rows, first 2 columns (end position excluded)
print(df.loc[df['age'] > 30, 'name'])       # boolean mask for rows, label for the column

4. 数据处理

# Missing values
print(df.isnull().sum())           # count of missing values per column
print(df.dropna())                 # drop rows that contain missing values
print(df.fillna(0))                # fill missing values with 0
# numeric_only=True: pandas >= 2.0 raises TypeError when mean() meets text
# columns such as 'name' or 'city', so the fill values are computed from the
# numeric columns only; other columns are left untouched by fillna.
print(df.fillna(df.mean(numeric_only=True)))        # fill with column means

# Drop duplicate rows
print(df.drop_duplicates())

# Type conversion
df['age'] = df['age'].astype(float)
# NOTE(review): assumes df has a 'date' column; the dict example earlier in
# this article does not define one — confirm against your data source
df['date'] = pd.to_datetime(df['date'])

# Add a derived column: bin ages into labelled groups
# (pd.cut intervals are right-closed by default: (0, 30], (30, 40], (40, 50], (50, 100])
df['age_group'] = pd.cut(df['age'], 
                         bins=[0, 30, 40, 50, 100], 
                         labels=['青年', '中青年', '中年', '老年'])

# Apply a function to every element of a column
df['name_upper'] = df['name'].apply(lambda x: x.upper())
df['salary_k'] = df['salary'].apply(lambda x: x / 1000)

# Group-by aggregation
grouped = df.groupby('city')['salary'].mean()
# 'age_group' is categorical (it comes from pd.cut). The default for `observed`
# is deprecated in pandas 2.x and changes in later versions, so it is pinned
# here: observed=False keeps a row for every category, including empty ones.
grouped = df.groupby(['city', 'age_group'], observed=False).agg({
    'salary': ['mean', 'sum', 'count'],
    'age': 'mean'
})

# Pivot table: mean salary per city (rows) and age group (columns)
# observed=False is pinned for the same reason as in the groupby above.
pivot = pd.pivot_table(df,
                       values='salary',
                       index='city',
                       columns='age_group',
                       aggfunc='mean',
                       observed=False)

# Sorting
df_sorted = df.sort_values('salary', ascending=False)
df_sorted = df.sort_values(['city', 'salary'], ascending=[True, False])

# Merging two frames on a key column
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})

merged = pd.merge(df1, df2, on='key', how='inner')   # inner join: keys A, B
merged = pd.merge(df1, df2, on='key', how='left')    # left join: keys A, B, C
merged = pd.merge(df1, df2, on='key', how='outer')   # outer join: keys A, B, C, D

# Concatenation
df_concat = pd.concat([df1, df2], axis=0)  # stack rows
df_concat = pd.concat([df1, df2], axis=1)  # place side by side, aligned on the index

数据可视化

1. Matplotlib基础

import matplotlib.pyplot as plt

# Use a font with CJK glyphs so the Chinese labels below render instead of
# showing empty boxes; Matplotlib uses the first family in the list that is
# installed on the machine.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'Arial Unicode MS',
                                   'Noto Sans CJK SC', 'DejaVu Sans']
# CJK fonts often lack the Unicode minus sign, so draw an ASCII hyphen instead.
plt.rcParams['axes.unicode_minus'] = False

# Basic line plot
x = np.linspace(0, 10, 100)
y = np.sin(x)

plt.figure(figsize=(10, 6))
plt.plot(x, y, 'b-', linewidth=2, label='sin(x)')
plt.xlabel('X轴')
plt.ylabel('Y轴')
plt.title('正弦函数')
plt.legend()
plt.grid(True)
plt.show()

# Multiple subplots on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

axes[0, 0].plot(x, np.sin(x))
axes[0, 0].set_title('sin(x)')

axes[0, 1].plot(x, np.cos(x))
axes[0, 1].set_title('cos(x)')

# tan(x) diverges near odd multiples of pi/2, so this panel shows large spikes
axes[1, 0].plot(x, np.tan(x))
axes[1, 0].set_title('tan(x)')

axes[1, 1].plot(x, x**2)
axes[1, 1].set_title('x^2')

plt.tight_layout()
plt.show()

# Bar chart
categories = ['A', 'B', 'C', 'D']
values = [23, 45, 56, 78]

plt.bar(categories, values, color=['red', 'green', 'blue', 'orange'])
plt.xlabel('类别')
plt.ylabel('数值')
plt.title('柱状图示例')
plt.show()

# Scatter plot (x and y are rebound to random samples here)
x = np.random.randn(100)
y = np.random.randn(100)
colors = np.random.rand(100)
sizes = 1000 * np.random.rand(100)

plt.scatter(x, y, c=colors, s=sizes, alpha=0.5)
plt.colorbar()
plt.show()

# Histogram (note: rebinds the name `data` used for the dict in the DataFrame example)
data = np.random.randn(1000)
plt.hist(data, bins=30, alpha=0.7, color='blue', edgecolor='black')
plt.xlabel('数值')
plt.ylabel('频数')
plt.title('直方图')
plt.show()

2. Seaborn高级可视化

import seaborn as sns

# Set the style
# set_style() also resets font.sans-serif, so a CJK-capable font list is passed
# through `rc` to keep the Chinese titles below from rendering as empty boxes.
sns.set_style('whitegrid', rc={'font.sans-serif': ['SimHei', 'Microsoft YaHei', 'Arial Unicode MS',
                                                   'Noto Sans CJK SC', 'DejaVu Sans']})

# Load the example dataset
# (load_dataset fetches the data from seaborn's online example repository)
tips = sns.load_dataset('tips')

# Distribution plot: histogram with a KDE curve overlaid
sns.histplot(tips['total_bill'], kde=True)
plt.title('账单金额分布')
plt.show()

# Box plot of the bill per day
sns.boxplot(x='day', y='total_bill', data=tips)
plt.title('每日账单箱线图')
plt.show()

# Violin plot of the bill per day
sns.violinplot(x='day', y='total_bill', data=tips)
plt.title('每日账单小提琴图')
plt.show()

# Scatter plot of tip against bill, colored by sex
sns.scatterplot(x='total_bill', y='tip', hue='sex', data=tips)
plt.title('账单与小费关系')
plt.show()

# Heatmap of the correlation matrix of the numeric columns
correlation = tips[['total_bill', 'tip', 'size']].corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.title('相关性热力图')
plt.show()

# Pairwise relationship plot
sns.pairplot(tips, hue='sex')
plt.show()

# Facet grid: one panel per (smoker, time) combination
g = sns.FacetGrid(tips, col='time', row='smoker')
g.map(sns.scatterplot, 'total_bill', 'tip')
plt.show()

实战案例

案例1：销售数据分析

# Load the sales data
# NOTE(review): the code below reads the columns date, amount, category,
# region, customer_id and order_id — confirm they exist in sales_data.csv
sales_data = pd.read_csv('sales_data.csv')

# Cleaning: parse dates, then drop every row that has any missing value
sales_data['date'] = pd.to_datetime(sales_data['date'])
sales_data = sales_data.dropna()

# Analysis
# 1. Monthly sales trend: sum of amount per calendar month
monthly_sales = sales_data.groupby(sales_data['date'].dt.to_period('M'))['amount'].sum()

plt.figure(figsize=(12, 6))
monthly_sales.plot(kind='line', marker='o')
plt.title('月度销售额趋势')
plt.xlabel('月份')
plt.ylabel('销售额')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 2. Share of sales per product category
category_sales = sales_data.groupby('category')['amount'].sum()
plt.figure(figsize=(8, 8))
plt.pie(category_sales, labels=category_sales.index, autopct='%1.1f%%')
plt.title('产品类别销售占比')
plt.show()

# 3. Sales per region, largest first
region_sales = sales_data.groupby('region')['amount'].sum().sort_values(ascending=False)
plt.figure(figsize=(10, 6))
region_sales.plot(kind='bar')
plt.title('地区销售额对比')
plt.xlabel('地区')
plt.ylabel('销售额')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 4. Customer value: total amount and count of order_id values per customer
customer_value = sales_data.groupby('customer_id').agg({
    'amount': 'sum',
    'order_id': 'count'
}).rename(columns={'order_id': 'order_count'})

# RFM analysis
from datetime import datetime
# The latest date in the data is the reference point for the recency calculation
reference_date = sales_data['date'].max()

rfm = sales_data.groupby('customer_id').agg({
    'date': lambda x: (reference_date - x.max()).days,  # Recency: days between the customer's latest date and the reference date
    'order_id': 'count',  # Frequency: count of order_id values
    'amount': 'sum'  # Monetary: summed amount
}).rename(columns={
    'date': 'recency',
    'order_id': 'frequency',
    'amount': 'monetary'
})

# Customer scoring: quintile scores from 1 to 5 for each RFM dimension
# All three scores are cut on first-occurrence ranks rather than raw values:
# tied values can make the quintile edges coincide, and pd.qcut then raises
# "Bin edges must be unique". Ranking keeps every bin edge distinct, and for
# data without ties it assigns the same bins as cutting the raw values.
# Recency labels are reversed so that a more recent purchase scores higher.
rfm['r_score'] = pd.qcut(rfm['recency'].rank(method='first'), 5, labels=[5,4,3,2,1])
rfm['f_score'] = pd.qcut(rfm['frequency'].rank(method='first'), 5, labels=[1,2,3,4,5])
rfm['m_score'] = pd.qcut(rfm['monetary'].rank(method='first'), 5, labels=[1,2,3,4,5])

# Concatenate the three scores into a three-character code such as '555'
rfm['rfm_score'] = rfm['r_score'].astype(str) + rfm['f_score'].astype(str) + rfm['m_score'].astype(str)

def segment_customer(row):
    """Return the customer segment name for a row's combined RFM code.

    Args:
        row: A mapping-like object (for example a DataFrame row) whose
            'rfm_score' entry holds the three-character RFM code.

    Returns:
        The name of the first segment whose code list contains the row's
        code, or '一般客户' when the code is not listed in any segment.
    """
    # Segments are checked in this order; the first match wins.
    segment_table = (
        ('重要价值客户', ('555', '554', '544', '545', '454', '455', '445')),
        ('重要保持客户', ('543', '444', '435', '355', '354', '345', '344', '335')),
        ('新客户', ('512', '511', '422', '421', '412', '411', '311')),
        ('流失客户', ('155', '154', '144', '214', '215', '115', '114')),
    )
    code = row['rfm_score']
    for segment_name, codes in segment_table:
        if code in codes:
            return segment_name
    return '一般客户'

# Label every customer by passing each row of rfm to segment_customer
rfm['segment'] = rfm.apply(segment_customer, axis=1)

# Visualize the number of customers per segment
segment_counts = rfm['segment'].value_counts()
plt.figure(figsize=(10, 6))
segment_counts.plot(kind='bar')
plt.title('客户分群统计')
plt.xlabel('客户类型')
plt.ylabel('客户数量')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

总结

Python数据分析的核心工具包括：

- NumPy：高效的数值计算和数组操作
- Pandas：强大的数据处理和分析工具
- Matplotlib：灵活的基础可视化库
- Seaborn：基于Matplotlib的高级可视化库

掌握这些工具，可以高效地完成数据清洗、处理、分析和可视化的全流程工作。建议多进行实战练习，结合实际业务场景提升数据分析能力。