pig分析脚本
来源:互联网 发布:什么是数据切片 编辑:程序博客网 时间:2024/05/22 01:54
-- Location-transition analysis.
-- For each location `loc`, computes the share of observed transitions that go
-- to each following location `next_loc`, and keeps the three most frequent
-- followers per location. The result is stored in 'output' and printed.

-- Jars providing the UDFs used below. Every statement is terminated with ';'
-- so the script parses the same way in batch mode and when pasted into grunt.
REGISTER /home/mapred/software/hadoops/pig/pig-0.11.1/contrib/piggybank/java/piggybank.jar;
REGISTER /home/mapred/practise/joda-time-2.0.jar;
REGISTER /home/mapred/practise/datafu-1.2.0.jar;

-- Short aliases for the UDFs.
DEFINE CustomFormatToISO org.apache.pig.piggybank.evaluation.datetime.convert.CustomFormatToISO();
DEFINE ISODaysBetween org.apache.pig.piggybank.evaluation.datetime.diff.ISODaysBetween();
DEFINE MarkovPairs datafu.pig.stats.MarkovPairs();

-- Read the input: '|'-delimited records of (imsi, time, loc).
data = LOAD '/user/mapred/PigData.txt' USING PigStorage('|') AS ( imsi:chararray,time:chararray,loc:chararray);

-- Convert the timestamp: keep only the first 13 characters of `time` and parse
-- them with the pattern 'YYYY-MM-dd HH' into an ISO datetime string.
-- NOTE(review): assumes the raw value starts with a date and an hour in that
-- layout -- confirm against the input file.
toISO = FOREACH data GENERATE imsi, CustomFormatToISO( SUBSTRING(time,0,13),'YYYY-MM-dd HH') AS time:chararray,loc;

-- Group the records by subscriber (imsi).
grp = GROUP toISO BY imsi;

-- Within each subscriber, order the records by time and let MarkovPairs emit
-- (current record, following record) pairs. The first tuple is named `cur`
-- rather than `data` so it cannot be confused with the `data` relation above.
pairs = FOREACH grp
{
    sorted = ORDER toISO BY time;
    pair = MarkovPairs(sorted);
    GENERATE FLATTEN(pair) AS (cur:tuple(imsi,time,loc),next:tuple(imsi,time,loc) );
};

-- Flatten each pair into one row holding both times and both locations.
prj = FOREACH pairs GENERATE cur.imsi AS imsi,cur.time AS time,next.time AS next_time,cur.loc AS loc,next.loc AS next_loc;

-- Keep only the pairs for which ISODaysBetween reports a difference of 0 days.
flt = FILTER prj BY ISODaysBetween(next_time, time) == 0L;

-- Number of transitions leaving each location.
total_count = FOREACH (GROUP flt BY loc) GENERATE group AS loc,COUNT(flt) AS total;

-- Number of transitions for each (loc, next_loc) pair.
pairs_count = FOREACH (GROUP flt by (loc,next_loc) ) GENERATE FLATTEN(group) AS (loc,next_loc),COUNT(flt) AS cnt;

-- Attach the per-location total to every pair count. 'replicated' requests a
-- fragment-replicate join, with total_count as the replicated (last) input.
jnd = JOIN pairs_count BY loc,total_count BY loc USING 'replicated';

-- Transition probability = pair count / total count for the source location.
prob = FOREACH jnd GENERATE pairs_count::loc AS loc, pairs_count::next_loc AS next_loc,(double)cnt/(double)total AS probability;

-- For each source location keep the three rows with the highest probability.
top3 = FOREACH (GROUP prob BY loc)
{
    sorted = ORDER prob BY probability DESC;
    top = LIMIT sorted 3;
    GENERATE FLATTEN(top);
};

-- Persist the result and print it.
STORE top3 INTO 'output';
cat output;
-- Location-transition analysis.
-- For each location `loc`, computes the share of observed transitions that go
-- to each following location `next_loc`, and keeps the three most frequent
-- followers per location. The result is stored in 'output' and printed.

-- Jars providing the UDFs used below. Every statement is terminated with ';'
-- so the script parses the same way in batch mode and when pasted into grunt.
REGISTER /home/mapred/software/hadoops/pig/pig-0.11.1/contrib/piggybank/java/piggybank.jar;
REGISTER /home/mapred/practise/joda-time-2.0.jar;
REGISTER /home/mapred/practise/datafu-1.2.0.jar;

-- Short aliases for the UDFs.
DEFINE CustomFormatToISO org.apache.pig.piggybank.evaluation.datetime.convert.CustomFormatToISO();
DEFINE ISODaysBetween org.apache.pig.piggybank.evaluation.datetime.diff.ISODaysBetween();
DEFINE MarkovPairs datafu.pig.stats.MarkovPairs();

-- Read the input: '|'-delimited records of (imsi, time, loc).
data = LOAD '/user/mapred/PigData.txt' USING PigStorage('|') AS ( imsi:chararray,time:chararray,loc:chararray);

-- Convert the timestamp: keep only the first 13 characters of `time` and parse
-- them with the pattern 'YYYY-MM-dd HH' into an ISO datetime string.
-- NOTE(review): assumes the raw value starts with a date and an hour in that
-- layout -- confirm against the input file.
toISO = FOREACH data GENERATE imsi, CustomFormatToISO( SUBSTRING(time,0,13),'YYYY-MM-dd HH') AS time:chararray,loc;

-- Group the records by subscriber (imsi).
grp = GROUP toISO BY imsi;

-- Within each subscriber, order the records by time and let MarkovPairs emit
-- (current record, following record) pairs. The first tuple is named `cur`
-- rather than `data` so it cannot be confused with the `data` relation above.
pairs = FOREACH grp
{
    sorted = ORDER toISO BY time;
    pair = MarkovPairs(sorted);
    GENERATE FLATTEN(pair) AS (cur:tuple(imsi,time,loc),next:tuple(imsi,time,loc) );
};

-- Flatten each pair into one row holding both times and both locations.
prj = FOREACH pairs GENERATE cur.imsi AS imsi,cur.time AS time,next.time AS next_time,cur.loc AS loc,next.loc AS next_loc;

-- Keep only the pairs for which ISODaysBetween reports a difference of 0 days.
flt = FILTER prj BY ISODaysBetween(next_time, time) == 0L;

-- Number of transitions leaving each location.
total_count = FOREACH (GROUP flt BY loc) GENERATE group AS loc,COUNT(flt) AS total;

-- Number of transitions for each (loc, next_loc) pair.
pairs_count = FOREACH (GROUP flt by (loc,next_loc) ) GENERATE FLATTEN(group) AS (loc,next_loc),COUNT(flt) AS cnt;

-- Attach the per-location total to every pair count. 'replicated' requests a
-- fragment-replicate join, with total_count as the replicated (last) input.
jnd = JOIN pairs_count BY loc,total_count BY loc USING 'replicated';

-- Transition probability = pair count / total count for the source location.
prob = FOREACH jnd GENERATE pairs_count::loc AS loc, pairs_count::next_loc AS next_loc,(double)cnt/(double)total AS probability;

-- For each source location keep the three rows with the highest probability.
top3 = FOREACH (GROUP prob BY loc)
{
    sorted = ORDER prob BY probability DESC;
    top = LIMIT sorted 3;
    GENERATE FLATTEN(top);
};

-- Persist the result and print it.
STORE top3 INTO 'output';
cat output;
0 0
- pig分析脚本
- pig 分析 脚本
- pig 分析 脚本
- 【pig】pig脚本规范
- PIG--shell脚本
- pig脚本总结
- PIG LATIN分析报告
- PIG LATIN分析报告
- Pig常见错误分析
- pig分析日志脚本(1) 统计行数和单词个数wordcount
- Hadoop生态系统搭建(2)——数据分析脚本 Pig 的安装部署与测试
- pig脚本记录,对于pig脚本跑批处理
- Pig脚本书写时候注意事项
- 使用 Pig 进行数据分析
- 向pig脚本中传入参数
- 向Pig脚本中传入参数
- pig
- pig
- yii rules规则中 unique的验证
- iOS 数据持久化四-SQLite3(1)
- WinDBG 技巧:如何生成Dump 文件(.dump 命令)
- 使用Memory Analyzer tool(MAT)分析内存泄漏(二)
- PHP for循环
- pig分析脚本
- Xcode5 添加.a库编译错误
- day7面向对象7
- java中的Class对象、new关键字
- CentOS安装截图工具 gnome-screenshot
- VirtualBox中linux系统如何全屏显示
- APIDemo分析:fragment的使用方法
- IIS网站设置禁止IP访问设置方法
- 谈计算机网络安全的管控路径