LibDataFrame 是一个用于数据处理和分析的 C++ 库,其 API 风格高度借鉴了 Python 的 pandas 库。它提供高效的数据结构和算法,支持多种数据格式和操作,让 C++ 中的数据分析变得简单直观。
git clone https://github.com/hosseinmoein/DataFrame.git
cd DataFrame
mkdir build && cd build
cmake ..
cmake --build .
sudo make install
#include <DataFrame/DataFrame.h>
#include <DataFrame/DataFrame.h>
#include <iostream>
using namespace hmdf;
int main() {
    // Start from an empty DataFrame.
    MyDataFrame df;

    // Three columns of five entries each, attached in id, name, age order.
    std::vector<int> id_values{1, 2, 3, 4, 5};
    std::vector<std::string> name_values{"Alice", "Bob", "Charlie", "David", "Eve"};
    std::vector<int> age_values{25, 30, 35, 28, 32};
    df.load_column("id", id_values);
    df.load_column("name", name_values);
    df.load_column("age", age_values);

    // Print a heading followed by the DataFrame itself.
    std::cout << "DataFrame:" << std::endl << df << std::endl;
    return 0;
}
#include <DataFrame/DataFrame.h>
#include <iostream>
using namespace hmdf;
int main() {
// 从 CSV 文件读取
MyDataFrame df = MyDataFrame::read_csv("data.csv");
// 显示前几行
std::cout << "First 5 rows:" << std::endl;
df.get_head<std::string, int>({"name", "age"}, 5);
return 0;
}
#include <DataFrame/DataFrame.h>
using namespace hmdf;
int main() {
    MyDataFrame df;

    // Sample data: three ids and the three matching names.
    std::vector<int> id_values{1, 2, 3};
    std::vector<std::string> name_values{"Alice", "Bob", "Charlie"};
    df.load_column("id", id_values);
    df.load_column("name", name_values);

    // Write the frame out to output.csv.
    // NOTE(review): only std::string is listed as a template argument although
    // the frame also holds an int column; the other examples list every column
    // type -- confirm whether int must be listed here too. The meaning of the
    // trailing `false` argument is not visible from this snippet.
    df.write<std::string>("output.csv", false);
    return 0;
}
#include <DataFrame/DataFrame.h>
#include <iostream>
using namespace hmdf;
int main() {
    MyDataFrame df;

    // Sample data: seven people with an age, a name and a score each.
    std::vector<int> age_values{25, 30, 35, 28, 32, 40, 45};
    std::vector<std::string> name_values{"Alice", "Bob", "Charlie", "David", "Eve", "Frank", "Grace"};
    std::vector<int> score_values{85, 90, 95, 88, 92, 78, 96};
    df.load_column("age", age_values);
    df.load_column("name", name_values);
    df.load_column("score", score_values);

    // Filter on the "age" column with a predicate that accepts ages strictly above 30.
    auto filtered_df = df.filter<int>("age", [](int years) { return years > 30; });

    std::cout << "People older than 30:" << std::endl << filtered_df << std::endl;
    return 0;
}
#include <DataFrame/DataFrame.h>
#include <iostream>
using namespace hmdf;
int main() {
    MyDataFrame df;

    // Three columns loaded in name, age, score order.
    std::vector<std::string> name_values{"Alice", "Bob", "Charlie"};
    std::vector<int> age_values{25, 30, 35};
    std::vector<int> score_values{85, 90, 95};
    df.load_column("name", name_values);
    df.load_column("age", age_values);
    df.load_column("score", score_values);

    // Request only the "name" and "age" columns; "score" is not requested.
    auto selected = df.get_subset<std::string, int>({"name", "age"});

    std::cout << "Selected columns:" << std::endl << selected << std::endl;
    return 0;
}
#include <DataFrame/DataFrame.h>
#include <iostream>
using namespace hmdf;
int main() {
    MyDataFrame df;

    // Four people and their scores.
    std::vector<std::string> name_values{"Alice", "Bob", "Charlie", "David"};
    std::vector<int> score_values{88, 92, 85, 95};
    df.load_column("name", name_values);
    df.load_column("score", score_values);

    // First pass: sort on "score" with sort_order::ASCEND, then print.
    df.sort<int>("score", sort_order::ASCEND);
    std::cout << "Sorted by score (ascending):" << std::endl << df << std::endl;

    // Second pass: sort the same frame with sort_order::DESCEND, then print again.
    df.sort<int>("score", sort_order::DESCEND);
    std::cout << "Sorted by score (descending):" << std::endl << df << std::endl;
    return 0;
}
#include <DataFrame/DataFrame.h>
#include <iostream>
#include <map>
using namespace hmdf;
int main() {
    MyDataFrame df;

    // Sample data for grouping: six employees spread over three departments.
    std::vector<std::string> departments = {"IT", "HR", "IT", "Sales", "HR", "IT"};
    std::vector<std::string> names = {"Alice", "Bob", "Charlie", "David", "Eve", "Frank"};
    std::vector<int> salaries = {80000, 60000, 90000, 75000, 65000, 85000};
    df.load_column("department", departments);
    df.load_column("name", names);
    df.load_column("salary", salaries);

    // Group by department and compute the TOTAL salary per department.
    // The operation passed is groupby_ops::SUM, matching the heading printed
    // below; the earlier comment described this as an average, which it is not.
    auto result = df.groupby<std::string>(
        "department",
        MyDataFrame::groupby_ops::SUM,
        std::vector<std::string>{"salary"}
    );

    std::cout << "Total salary by department:" << std::endl;
    std::cout << result << std::endl;
    return 0;
}
#include <DataFrame/DataFrame.h>
#include <iostream>
#include <numeric>
using namespace hmdf;
int main() {
    MyDataFrame df;

    // Five values labelled with group "A" or "B".
    std::vector<std::string> group_labels{"A", "B", "A", "B", "A"};
    std::vector<double> value_data{1.0, 2.0, 3.0, 4.0, 5.0};
    df.load_column("group", group_labels);
    df.load_column("value", value_data);

    // Custom aggregation: arithmetic mean of a vector, defined as 0.0 for an
    // empty vector so that no division by zero takes place.
    auto mean_agg = [](const std::vector<double>& samples) {
        return samples.empty()
                   ? 0.0
                   : std::accumulate(samples.begin(), samples.end(), 0.0) / samples.size();
    };

    // Apply the custom aggregation to the "value" column, grouped by "group".
    auto grouped = df.groupby<std::string>("group", {{"value", mean_agg}});

    std::cout << "Mean by group:" << std::endl << grouped << std::endl;
    return 0;
}
#include <DataFrame/DataFrame.h>
#include <iostream>
using namespace hmdf;
int main() {
    MyDataFrame df1, df2;

    // Left frame: id -> name.
    std::vector<int> left_ids{1, 2, 3};
    std::vector<std::string> left_names{"Alice", "Bob", "Charlie"};
    df1.load_column("id", left_ids);
    df1.load_column("name", left_names);

    // Right frame: id -> score.
    std::vector<int> right_ids{1, 2, 3};
    std::vector<int> right_scores{85, 90, 95};
    df2.load_column("id", right_ids);
    df2.load_column("score", right_scores);

    // Join df1 with df2 using join_type::INNER. Both key lists name "id";
    // the last two lists name "name" (from df1) and "score" (from df2).
    auto merged = df1.join<int>(df2, join_type::INNER, {"id"}, {"id"}, {"name"}, {"score"});

    std::cout << "Merged DataFrame:" << std::endl << merged << std::endl;
    return 0;
}
#include <DataFrame/DataFrame.h>
#include <iostream>
using namespace hmdf;
int main() {
    // Simulated sales data: seven rows of product, region, quantity and unit price.
    MyDataFrame sales_df;
    std::vector<std::string> products = {"A", "B", "C", "A", "B", "C", "A"};
    std::vector<std::string> regions = {"North", "South", "East", "North", "South", "East", "West"};
    std::vector<int> quantities = {10, 20, 15, 5, 25, 12, 18};
    std::vector<double> prices = {100.0, 200.0, 150.0, 100.0, 200.0, 150.0, 100.0};
    sales_df.load_column("product", products);
    sales_df.load_column("region", regions);
    sales_df.load_column("quantity", quantities);
    sales_df.load_column("price", prices);

    // Derive the per-row sales amount: quantity * unit price.
    // The final size is known, so reserve once instead of growing repeatedly.
    std::vector<double> total_sales;
    total_sales.reserve(quantities.size());
    for (size_t i = 0; i < quantities.size(); ++i) {
        total_sales.push_back(quantities[i] * prices[i]);
    }
    sales_df.load_column("total_sales", total_sales);

    // Keep only the records whose sales amount is strictly greater than 2000.
    // (The earlier comment said 1000, which did not match the 2000.0 threshold
    // used by the predicate.)
    auto high_sales = sales_df.filter<double>("total_sales", [](double sales) {
        return sales > 2000.0;
    });
    std::cout << "High sales records:" << std::endl;
    std::cout << high_sales << std::endl;

    // Group by product and sum the sales amount of each product.
    auto by_product = sales_df.groupby<std::string>(
        "product",
        MyDataFrame::groupby_ops::SUM,
        std::vector<std::string>{"total_sales"}
    );
    std::cout << "Total sales by product:" << std::endl;
    std::cout << by_product << std::endl;
    return 0;
}
#include <DataFrame/DataFrame.h>
#include <iostream>
#include <chrono>
using namespace hmdf;
int main() {
    MyDataFrame time_df;

    // Simulated time series: five consecutive dates (kept as strings) and one price per date.
    std::vector<std::string> dates{
        "2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04", "2024-01-05"
    };
    std::vector<double> closes{100.5, 102.3, 101.8, 103.2, 104.5};
    time_df.load_column("timestamp", dates);
    time_df.load_column("price", closes);

    // 3-point moving average. The first two positions do not have a full
    // window yet, so the raw price is stored there unchanged; from the third
    // position on, the current price and the two before it are averaged.
    std::vector<double> ma_3day;
    for (size_t idx = 0; idx < closes.size(); ++idx) {
        const bool window_incomplete = idx < 2;
        ma_3day.push_back(window_incomplete
                              ? closes[idx]
                              : (closes[idx] + closes[idx - 1] + closes[idx - 2]) / 3.0);
    }
    time_df.load_column("ma_3day", ma_3day);

    std::cout << "Time series with moving average:" << std::endl << time_df << std::endl;
    return 0;
}
| 操作 | 示例 | 说明 |
|---|---|---|
| `load_column()` | `df.load_column("name", data)` | 添加列 |
| `filter()` | `df.filter<int>("age", [](int x){return x>30;})` | 过滤行 |
| `sort()` | `df.sort<int>("age", ASCEND)` | 排序 |
| `groupby()` | `df.groupby<string>("key", SUM, {"value"})` | 分组聚合 |
| `join()` | `df1.join<int>(df2, INNER, {"id"}, {"id"})` | 连接 DataFrame |
| `get_head()` | `df.get_head<int>({"col"}, 5)` | 获取前 N 行 |
| `read_csv()` | `MyDataFrame::read_csv("file.csv")` | 读取 CSV |
| `write()` | `df.write<string>("output.csv")` | 写入 CSV |