The practice data has been uploaded.
Link: https://pan.baidu.com/s/1L5znszdXLUytH9qvTdO4JA
Extraction code: lzyd
First, complete a word count exercise on The Man of Property using Hive.
hive> create table `article` (`sentence` string);
OK
Time taken: 1.019 seconds
hive> load data local inpath '/mnt/hgfs/vm_shared/The_Man_of_Property.txt' overwrite into table article;
Loading data to table default.article
Table default.article stats: [numFiles=1, numRows=0, totalSize=632207, rawDataSize=0]
OK
Time taken: 1.386 seconds
Use the split function to break each line of the text on spaces:
hive> select split(sentence," ") from article;
["Preface"]
["“The","Forsyte","Saga”","was","the","title","originally","destined","for","that","part","of","it","which","is","called","“The","Man","of","Property”;","and","to","adopt","it","for","the","collected","chronicles","of","the","Forsyte","family","has","indulged","the","Forsytean","tenacity","that","is","in","all","of","us.","The","word","Saga","might","be","objected","to","on","the","ground","that","it","connotes","the","heroic","and","that","there","is","little","heroism","in","these","pages.","But","it","is","used","with","a","suitable","irony;","and,","after","all,","this","long","tale,","though","it","may","deal","with","folk","in","frock","coats,","furbelows,","and","a","gilt-edged","period,","is","not","devoid","of","the","essential","heat","of","conflict.","Discounting","for","the","gigantic","stature","and","blood-thirstiness","of","old","days,","as","they","have","come","down","to","us","in","fairy-tale","and","legend,","the","folk","of","the","old","Sagas","were","Forsytes,","assuredly,","in","their","possessive","instincts,","and","as","little","proof","against","the","inroads","of","beauty","and","passion","as","Swithin,","Soames,","or","even","Young","Jolyon.","And","if","heroic","figures,","in","days","that","never","were,","seem","to","startle","out","from","their","surroundings","in","fashion","unbecoming","to","a","Forsyte","of","the","Victorian","era,","we","may","be","sure","that","tribal","instinct","was","even","then","the","prime","force,","and","that","“family”","and","the","sense","of","home","and","property","counted","as","they","do","to","this","day,","for","all","the","recent","efforts","to","“talk","them","out.”"]
["So","many","people","have","written","and","claimed","that","their","families","were","the","originals","of","the","Forsytes","that","one","has","been","almost","encouraged","to","believe","in","the","typicality","of","an","imagined","species.","Manners","change","and","modes","evolve,","and","“Timothy’s","on","the","Bayswater","Road”","becomes","a","nest","of","the","unbelievable","in","all","except","essentials;","we","shall","not","look","upon","its","like","again,","nor","perhaps","on","such","a","one","as","James","or","Old","Jolyon.","And","yet","the","figures","of","Insurance","Societies","and","the","utterances","of","Judges","reassure","us","daily","that","our","earthly","paradise","is","still","a","rich","preserve,","where","the","wild","raiders,","Beauty","and","Passion,","come","stealing","in,","filching","security","from","beneath","our","noses.","As","surely","as","a","dog","will","bark","at","a","brass","band,","so","will","the","essential","Soames","in","human","nature","ever","rise","up","uneasily","against","the","dissolution","which","hovers","round","the","folds","of","ownership."]
["“Let","the","dead","Past","bury","its","dead”","would","be","a","better","saying","if","the","Past","ever","died.","The","persistence","of","the","Past","is","one","of","those","tragi-comic","blessings","which","each","new","age","denies,","coming","cocksure","on","to","the","stage","to","mouth","its","claim","to","a","perfect","novelty."]
["But","no","Age","is","so","new","as","that!","Human","Nature,","under","its","changing","pretensions","and","clothes,","is","and","ever","will","be","very","much","of","a","Forsyte,","and","might,","after","all,","be","a","much","worse","animal."]
...
["The","End"]
Time taken: 0.086 seconds, Fetched: 2866 row(s)
The result is one array of strings per line, 2,866 rows in total.
Cross-check with the wc command, which reports the file's line count:
wc The_Man_of_Property.txt
2866 111783 632207 The_Man_of_Property.txt
(lines  words  bytes  filename)
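The same line count can be confirmed from inside Hive (a quick sanity check, not part of the original transcript):
hive> select count(1) from article;
-- should return 2866, matching the line count reported by wc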
split packs each line's words into a single array; to get one word per row, flatten the arrays with explode:
hive> select explode(split(sentence," ")) from article;
...
we
are
not
at
home.”
And
in
young
Jolyon’s
face
he
slammed
the
door.
The
End
Time taken: 0.085 seconds, Fetched: 111818 row(s)
explode returns 111,818 rows, slightly more than the 111,783 words wc reported, most likely because split emits empty strings wherever consecutive spaces occur. The tokens also still carry punctuation (home.” above); regexp_extract can strip it by keeping only the run of letters:
select regexp_extract(word,'[a-zA-Z]+',0) from (select explode(split(sentence," ")) word from article) t;
...
at
home
And
in
young
Jolyon
face
he
slammed
the
door
The
End
Time taken: 0.066 seconds, Fetched: 111818 row(s)
Keeping only letters loses contractions: Jolyon’s collapsed to Jolyon above. Extending the pattern to allow a curly apostrophe between letters (note it then requires at least two letters, so one-letter words like a are not matched), then grouping and counting, gives the word count:
select word, count(*)
from (
select regexp_extract(str,'[a-zA-Z]+[\’]*[a-zA-Z]+',0) word
from (
select explode(split(sentence," ")) str
from article
) t1
) t2
group by word;
......
yield 4
yielded 3
yielding 2
yields 1
you 522
young 198
younger 10
youngest 3
youngling 1
your 130
yours 2
yourself 22
yourselves 1
youth 10
you’d 14
you’ll 21
you’re 23
you’ve 25
Time taken: 27.26 seconds, Fetched: 9872 row(s)
For comparison, the basic word count without any regex cleanup:
select word, count(*) AS cnt
from (
select explode(split(sentence,' ')) word
from article
) t
group by word;
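A natural next step, not shown in the original transcript, is to sort by frequency to see the most common words:
select word, count(*) as cnt
from (
select explode(split(sentence,' ')) word
from article
) t
group by word
order by cnt desc
limit 10;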
trains.csv (order-product detail)
----------------------
order_id: order number
product_id: product ID
add_to_cart_order: position at which the product was added to the cart
reordered: whether this product has been purchased before (1 = yes, 0 = no)
orders.csv (role in the warehouse: user-behavior table)
----------------------
order_id: order number
user_id: user ID
eval_set: which set the order belongs to (prior history, or needed for training)
order_number: sequence number of the order among the user's purchases
order_dow: order day of week, the weekday the order was placed (0-6)
order_hour_of_day: the hour of day the order was placed (0-23)
days_since_prior_order: days elapsed between this order and the user's previous one
Create the trains table to match the schema above:
create table trains(
order_id string,
product_id string,
add_to_cart_order string,
reordered string
)row format delimited
fields terminated by ','
lines terminated by '\n';
hive> load data local inpath '/mnt/hgfs/vm_shared/trains.csv' overwrite into table trains;
Loading data to table default.trains
Table default.trains stats: [numFiles=1, numRows=0, totalSize=24680147, rawDataSize=0]
OK
Time taken: 1.801 seconds
hive> select * from trains limit 10;
OK
trains.order_id trains.product_id trains.add_to_cart_order trains.reordered
order_id product_id add_to_cart_order reordered
1 49302 1 1
1 11109 2 1
1 10246 3 0
1 49683 4 0
1 43633 5 1
1 13176 6 0
1 47209 7 0
1 22035 8 1
36 39612 1 0
Time taken: 0.1 seconds, Fetched: 10 row(s)
In the output, the first row is the column header Hive prints (the names I defined), and the second row is the header row from the CSV file itself, which was loaded as a data row. That row is dirty data and needs to be removed.
There are several ways to remove it.
Method 1: the data is already loaded, so overwrite the table with HQL, filtering the header row out:
insert overwrite table trains
select * from trains where order_id != 'order_id';
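To double-check that the header row is gone (a quick query, not in the original transcript):
select count(*) from trains where order_id = 'order_id';
-- should now return 0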
Method 2: delete the first line of the data file before loading it.
[root@node1 vm_shared]# head trains.csv
// the first few lines of the dataset; the first line is the header
order_id,product_id,add_to_cart_order,reordered
1,49302,1,1
1,11109,2,1
1,10246,3,0
1,49683,4,0
1,43633,5,1
1,13176,6,0
1,47209,7,0
1,22035,8,1
36,39612,1,0
[root@node1 vm_shared]# sed '1d' trains.csv > trains_tmp.csv
[root@node1 vm_shared]# head trains_tmp.csv
// the first line has been removed
1,49302,1,1
1,11109,2,1
1,10246,3,0
1,49683,4,0
1,43633,5,1
1,13176,6,0
1,47209,7,0
1,22035,8,1
36,39612,1,0
36,19660,2,1
Method 3: add the table property 'skip.header.line.count'='1' when creating the table, so the header line is skipped automatically whenever the table is read. For example:
create table xxx(
...
)
row format delimited
fields terminated by '\t'
tblproperties ('skip.header.line.count'='1');
Next, work with orders.csv.
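The transcript never shows the DDL for orders; a minimal sketch consistent with the schema described above (assuming all-string columns like the other tables, and skipping the header row) would be:
create table orders(
order_id string,
user_id string,
eval_set string,
order_number string,
order_dow string,
order_hour_of_day string,
days_since_prior_order string)
row format delimited
fields terminated by ','
lines terminated by '\n'
tblproperties ('skip.header.line.count'='1');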
Count how many orders each user has placed:
select user_id, count(*) from orders group by user_id;
...
Time taken: 32.335 seconds, Fetched: 206209 row(s)
Note: when other columns appear in the select list alongside aggregate functions (count, sum, avg, max, min), they must be listed in a group by clause.
Create the priors table, this time skipping the header row automatically:
create table priors(
order_id string,
product_id string,
add_to_cart_order string,
reordered string)
row format delimited
fields terminated by ','
lines terminated by '\n'
tblproperties ('skip.header.line.count'='1');
hive> load data local inpath '/mnt/hgfs/vm_shared/priors.csv' overwrite into table priors;
Loading data to table default.priors
Table default.priors stats: [numFiles=1, numRows=0, totalSize=577550706, rawDataSize=0]
OK
Time taken: 13.463 seconds
Count how many products each order contains:
select order_id, count(product_id) cnt from priors group by order_id;
Then join with orders to compute each user's average number of products per order:
select o.user_id, sum(p.cnt)/count(o.order_id)
from orders o
join (
select order_id, count(product_id) cnt
from priors
group by order_id
) p
on o.order_id=p.order_id
group by user_id
limit 10;
1 5.9
2 13.928571428571429
3 7.333333333333333
4 3.6
5 9.25
6 4.666666666666667
7 10.3
8 16.333333333333332
9 25.333333333333332
10 28.6
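The long decimals can be rounded for readability; a possible tweak (hypothetical, not in the original transcript) using Hive's round function:
select o.user_id, round(sum(p.cnt)/count(o.order_id), 2) avg_products
from orders o
join (select order_id, count(product_id) cnt from priors group by order_id) p
on o.order_id=p.order_id
group by user_id
limit 10;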
Count each user's orders on each day of the week, pivoting with case when:
select
user_id
, sum(case when order_dow='0' then 1 else 0 end) dow0
, sum(case when order_dow='1' then 1 else 0 end) dow1
, sum(case when order_dow='2' then 1 else 0 end) dow2
, sum(case when order_dow='3' then 1 else 0 end) dow3
, sum(case when order_dow='4' then 1 else 0 end) dow4
, sum(case when order_dow='5' then 1 else 0 end) dow5
, sum(case when order_dow='6' then 1 else 0 end) dow6
from orders
group by user_id;
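HiveQL (unlike some SQL dialects) has no built-in pivot operator, so the sum(case when ...) pattern above is the usual way to turn row values such as order_dow into columns.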
The trains and priors tables have the same structure, so we can search the full data of both sets together: for each user, count the distinct products bought across training and history orders, keeping users with at least 100.
-- define a temporary result set with with/as
with user_pro_cnt_tmp as (
select * from
(
-- training order data
select
a.user_id,b.product_id
from orders as a
left join trains b
on a.order_id=b.order_id
union all
-- historical (prior) order data
select
a.user_id,b.product_id
from orders as a
left join priors b
on a.order_id=b.order_id
) t
)
select
user_id
, count(distinct product_id) pro_cnt
from user_pro_cnt_tmp
group by user_id
having pro_cnt >= 100
limit 10;
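Design note: union all keeps every (user_id, product_id) pair from both branches, including duplicates between the training and history data, which is why the outer query counts distinct product_id rather than relying on the union itself to deduplicate.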