LibDataFrame 使用指南

LibDataFrame 是一个用于数据处理和分析的 C++ 库，其 API 风格借鉴了 Python 的 pandas 库。它提供高效的数据结构和算法，支持多种数据格式和操作，使 C++ 数据分析变得简单直观。

📦 安装

从 GitHub 获取

# Fetch the sources and configure an out-of-tree, optimized build.
git clone https://github.com/hosseinmoein/DataFrame.git
cd DataFrame
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
cmake --build .
# Install through CMake so the step works with any generator, not only Makefiles.
sudo cmake --install .

包含头文件

#include <DataFrame/DataFrame.h>

🚀 基础使用

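创建 DataFrame

注意：下面各示例中的 `MyDataFrame` 不是库自带的类型名，需要先自行定义别名（例如 `using MyDataFrame = hmdf::StdDataFrame<unsigned long>;`），否则示例无法编译。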

#include <DataFrame/DataFrame.h>
#include <iostream>

using namespace hmdf;

int main() {
    // 创建一个 DataFrame
    MyDataFrame df;

    // 添加列
    std::vector<int> ids = {1, 2, 3, 4, 5};
    std::vector<std::string> names = {"Alice", "Bob", "Charlie", "David", "Eve"};
    std::vector<int> ages = {25, 30, 35, 28, 32};

    df.load_column("id", ids);
    df.load_column("name", names);
    df.load_column("age", ages);

    // 显示 DataFrame
    std::cout << "DataFrame:" << std::endl;
    std::cout << df << std::endl;

    return 0;
}

读取 CSV 文件

#include <DataFrame/DataFrame.h>
#include <iostream>

using namespace hmdf;

int main() {
    // 从 CSV 文件读取
    MyDataFrame df = MyDataFrame::read_csv("data.csv");

    // 显示前几行
    std::cout << "First 5 rows:" << std::endl;
    df.get_head<std::string, int>({"name", "age"}, 5);

    return 0;
}

写入 CSV 文件

#include <DataFrame/DataFrame.h>

using namespace hmdf;

int main() {
    MyDataFrame df;

    // 创建数据
    std::vector<int> ids = {1, 2, 3};
    std::vector<std::string> names = {"Alice", "Bob", "Charlie"};

    df.load_column("id", ids);
    df.load_column("name", names);

    // 写入 CSV
    df.write<std::string>("output.csv", false);

    return 0;
}

🔧 数据选择和过滤

筛选数据 (filter)

#include <DataFrame/DataFrame.h>
#include <iostream>

using namespace hmdf;

int main() {
    MyDataFrame df;

    // 创建数据
    std::vector<int> ages = {25, 30, 35, 28, 32, 40, 45};
    std::vector<std::string> names = {"Alice", "Bob", "Charlie", "David", "Eve", "Frank", "Grace"};
    std::vector<int> scores = {85, 90, 95, 88, 92, 78, 96};

    df.load_column("age", ages);
    df.load_column("name", names);
    df.load_column("score", scores);

    // 筛选 age > 30 的行
    auto filtered_df = df.filter<int>("age", [](int age) {
        return age > 30;
    });

    std::cout << "People older than 30:" << std::endl;
    std::cout << filtered_df << std::endl;

    return 0;
}

选择列

#include <DataFrame/DataFrame.h>
#include <iostream>

using namespace hmdf;

int main() {
    MyDataFrame df;

    std::vector<std::string> names = {"Alice", "Bob", "Charlie"};
    std::vector<int> ages = {25, 30, 35};
    std::vector<int> scores = {85, 90, 95};

    df.load_column("name", names);
    df.load_column("age", ages);
    df.load_column("score", scores);

    // 选择特定列
    auto selected = df.get_subset<std::string, int>({"name", "age"});

    std::cout << "Selected columns:" << std::endl;
    std::cout << selected << std::endl;

    return 0;
}

📊 数据排序

按列排序 (sort)

#include <DataFrame/DataFrame.h>
#include <iostream>

using namespace hmdf;

int main() {
    MyDataFrame df;

    std::vector<std::string> names = {"Alice", "Bob", "Charlie", "David"};
    std::vector<int> scores = {88, 92, 85, 95};

    df.load_column("name", names);
    df.load_column("score", scores);

    // 按 score 升序排序
    df.sort<int>("score", sort_order::ASCEND);

    std::cout << "Sorted by score (ascending):" << std::endl;
    std::cout << df << std::endl;

    // 按 score 降序排序
    df.sort<int>("score", sort_order::DESCEND);

    std::cout << "Sorted by score (descending):" << std::endl;
    std::cout << df << std::endl;

    return 0;
}

📈 分组和聚合

GroupBy 和聚合操作

#include <DataFrame/DataFrame.h>
#include <iostream>
#include <map>

using namespace hmdf;

// Groups employees by department and prints the summed salary of each group.
int main() {
    MyDataFrame df;

    // Six employees spread across three departments.
    std::vector<std::string> departments = {"IT", "HR", "IT", "Sales", "HR", "IT"};
    std::vector<std::string> names = {"Alice", "Bob", "Charlie", "David", "Eve", "Frank"};
    std::vector<int> salaries = {80000, 60000, 90000, 75000, 65000, 85000};

    df.load_column("department", departments);
    df.load_column("name", names);
    df.load_column("salary", salaries);

    // Group by department and compute the total (SUM) salary per group.
    // This is a sum, not an average, matching the heading printed below.
    auto result = df.groupby<std::string>(
        "department",
        MyDataFrame::groupby_ops::SUM,
        std::vector<std::string>{"salary"}
    );

    std::cout << "Total salary by department:" << std::endl;
    std::cout << result << std::endl;

    return 0;
}

自定义聚合函数

#include <DataFrame/DataFrame.h>
#include <iostream>
#include <numeric>

using namespace hmdf;

int main() {
    MyDataFrame df;

    std::vector<std::string> groups = {"A", "B", "A", "B", "A"};
    std::vector<double> values = {1.0, 2.0, 3.0, 4.0, 5.0};

    df.load_column("group", groups);
    df.load_column("value", values);

    // 自定义聚合：计算平均值
    auto mean_agg = [](const std::vector<double>& vec) {
        if (vec.empty()) return 0.0;
        double sum = std::accumulate(vec.begin(), vec.end(), 0.0);
        return sum / vec.size();
    };

    // 应用自定义聚合
    auto grouped = df.groupby<std::string>("group", {{"value", mean_agg}});

    std::cout << "Mean by group:" << std::endl;
    std::cout << grouped << std::endl;

    return 0;
}

🔗 数据合并和连接

合并 DataFrame

#include <DataFrame/DataFrame.h>
#include <iostream>

using namespace hmdf;

int main() {
    MyDataFrame df1, df2;

    // 第一个 DataFrame
    std::vector<int> ids1 = {1, 2, 3};
    std::vector<std::string> names1 = {"Alice", "Bob", "Charlie"};

    df1.load_column("id", ids1);
    df1.load_column("name", names1);

    // 第二个 DataFrame
    std::vector<int> ids2 = {1, 2, 3};
    std::vector<int> scores = {85, 90, 95};

    df2.load_column("id", ids2);
    df2.load_column("score", scores);

    // 按列合并
    auto merged = df1.join<int>(
        df2,
        join_type::INNER,
        {"id"},
        {"id"},
        {"name"},
        {"score"}
    );

    std::cout << "Merged DataFrame:" << std::endl;
    std::cout << merged << std::endl;

    return 0;
}

💼 实际应用示例

数据分析流程

#include <DataFrame/DataFrame.h>
#include <iostream>

using namespace hmdf;

// End-to-end example: loads mock sales rows, derives a per-row sales total,
// prints the rows above a threshold, then prints totals summed per product.
int main() {
    // Mock sales data.
    MyDataFrame sales_df;

    std::vector<std::string> products = {"A", "B", "C", "A", "B", "C", "A"};
    std::vector<std::string> regions = {"North", "South", "East", "North", "South", "East", "West"};
    std::vector<int> quantities = {10, 20, 15, 5, 25, 12, 18};
    std::vector<double> prices = {100.0, 200.0, 150.0, 100.0, 200.0, 150.0, 100.0};

    sales_df.load_column("product", products);
    sales_df.load_column("region", regions);
    sales_df.load_column("quantity", quantities);
    sales_df.load_column("price", prices);

    // Derive total sales per row as quantity * price.
    std::vector<double> total_sales;
    total_sales.reserve(quantities.size());  // one allocation instead of repeated growth
    for (size_t i = 0; i < quantities.size(); ++i) {
        total_sales.push_back(quantities[i] * prices[i]);
    }
    sales_df.load_column("total_sales", total_sales);

    // Select the records whose total sales exceed 2000.
    auto high_sales = sales_df.filter<double>("total_sales", [](double sales) {
        return sales > 2000.0;
    });

    std::cout << "High sales records:" << std::endl;
    std::cout << high_sales << std::endl;

    // Group by product and sum the total sales of each group.
    auto by_product = sales_df.groupby<std::string>(
        "product",
        MyDataFrame::groupby_ops::SUM,
        std::vector<std::string>{"total_sales"}
    );

    std::cout << "Total sales by product:" << std::endl;
    std::cout << by_product << std::endl;

    return 0;
}

时间序列数据处理

#include <DataFrame/DataFrame.h>
#include <iostream>
#include <chrono>

using namespace hmdf;

int main() {
    MyDataFrame time_df;

    // 模拟时间序列数据
    std::vector<std::string> timestamps = {
        "2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04", "2024-01-05"
    };
    std::vector<double> prices = {100.5, 102.3, 101.8, 103.2, 104.5};

    time_df.load_column("timestamp", timestamps);
    time_df.load_column("price", prices);

    // 计算移动平均
    std::vector<double> ma_3day;
    for (size_t i = 0; i < prices.size(); ++i) {
        if (i < 2) {
            ma_3day.push_back(prices[i]);
        } else {
            double sum = prices[i] + prices[i-1] + prices[i-2];
            ma_3day.push_back(sum / 3.0);
        }
    }
    time_df.load_column("ma_3day", ma_3day);

    std::cout << "Time series with moving average:" << std::endl;
    std::cout << time_df << std::endl;

    return 0;
}

📋 常用操作速查

操作	示例	说明
`load_column()`	`df.load_column("name", data)`	添加列
`filter()`	`df.filter<int>("age", [](int x){return x>30;})`	过滤行
`sort()`	`df.sort<int>("age", sort_order::ASCEND)`	排序
`groupby()`	`df.groupby<std::string>("key", MyDataFrame::groupby_ops::SUM, std::vector<std::string>{"value"})`	分组聚合
`join()`	`df1.join<int>(df2, join_type::INNER, {"id"}, {"id"}, {"name"}, {"score"})`	连接 DataFrame
`get_head()`	`df.get_head<int>({"col"}, 5)`	获取前 N 行
`read_csv()`	`MyDataFrame::read_csv("file.csv")`	读取 CSV
`write()`	`df.write<std::string>("output.csv", false)`	写入 CSV

💡 提示： LibDataFrame 的 API 设计灵感来自 pandas，如果你熟悉 pandas，你会发现 LibDataFrame 的用法非常相似，让你在 C++ 中也能享受类似的数据分析体验。

⚠️ 注意：

LibDataFrame 需要 C++17 或更高版本（较新的版本可能要求更高的语言标准，如 C++20/C++23，请以所用版本的 README 为准）
编译时需要链接相应的库文件
大型数据集操作时注意内存使用
某些高级功能可能需要额外的依赖
本文示例为示意性写法，个别接口的名称和参数可能与所安装的版本不完全一致，使用前请对照官方文档核实