测试数据集:
create external table if not exists order_detail(
user_id string,
device_id string,
user_type string,
price double,
sales int
)
row format delimited fields terminated by
'\t'
lines terminated by
'\n'
stored as textfile;
hdfs dfs -put /home/liguodong/data/data.text /temp/lgd
load data inpath
'/temp/lgd/data.text' overwrite into table order_detail;
select * from order_detail;
+----------+-------------+------------+--------+--------+--+
| user_id | device_id | user_type | price | sales |
+----------+-------------+------------+--------+--------+--+
| zhangsa | dfsadsa323 | new | 67.1 |
2 |
| lisi | 543gfd
| old | 43.32 |
1 |
| wanger | 65ghf
| new | 88.88 |
3 |
| liiu | fdsfagwe | new | 66.0 |
1 |
| qibaqiu | fds | new | 54.32 |
1 |
| wangshi | f332 | old | 77.77 |
2 |
| liwei | hfd | old | 88.44 |
3 |
| wutong | 543gdfsd
| new | 56.55 |
6 |
| lilisi | dsfgg | new | 88.88 |
5 |
| qishili | fds | new | 66.66 |
5 |
+----------+-------------+------------+--------+--------+--+
关系运算
## > < =
##注意: String 的比较要注意(常用的时间比较可以先 to
_date 之后再比较)
select long_time>short
_time, long_time<short
_time,long_time=short
_time, to_date(long
_time)=to_date(short
_time)
from
(
select '2017-01-11 00:00:00' as long_time,
'2017-01-11' as short
_time
from
order_detail limit 1
)bb;
result:
true false false true
## 空值判断
select 1 from order
_detail where NULL is NULL limit 1;
## 非空判断
select 1 from order
_detail where 1 is not NULL limit 1;
## LIKE
语法: A LIKE B
描述: 字符串A符合表达式B的正则语法,则为TRUE;否则为FALSE. B中字符”
_”表示任意单个字符,而字符”%”表示任意数量的字符。
select user
_id from order_detail where user
_id like 'li%';
+----------+--+
| lisi |
| liiu |
| liwei |
select user
_id from order_detail where user
_id like 'li__';
+----------+--+
| lisi |
## RLIKE
语法: A RLIKE B
描述: 字符串A符合JAVA正则表达式 B 的正则语法,则为 TRUE;否则为 FALSE。
select user
_id from order_detail where user
_id rlike '^l.*i$';
+----------+--+
| lisi |
| liwei |
select device
_id from order_detail where device
_id rlike '^\\w+$';
+-------------+--+
| dfsadsa323 |
| 543gfd |
| 65ghf |
| fdsfagwe |
| fds |
| f332 |
| hfd |
| 543gdfsd |
| dsfgg |
select device
_id from order_detail where device
_id rlike '^[a-zA-Z]+$';
+------------+--+
| fdsfagwe |
| fds |
| hfd |
| dsfgg |
select device
_id from order_detail where device
_id rlike '^[a-zA-Z]{4,}$';
+------------+--+
| fdsfagwe |
## REGEXP
语法: A REGEXP B
描述: 功能与 RLIKE 相同
select device
_id from order_detail where device
_id REGEXP '^[a-zA-Z]{4,}$';
+------------+--+
| fdsfagwe |
数学运算
## + - * /
注意:
hive中最高精度的数据类型是 double,只精确到小数点后16位,在做除法运算的时候要特别注意
select user
_id,100-price,sales-1,price*sales,price/sales,
ceil(28.0/6.999999999999999999999),ceil(28.0/6.99999999999999),
price%sales from order_detail;
+----------+---------------------
+------+---------------------
+---------------------+------
+------+---------------------
+--+
| zhangsa | 32.900000000000006 | 1 | 134.2 | 33.55 | 4 | 5 | 1.0999999999999943 |
| lisi | 56.68 | 0 | 43.32 | 43.32 | 4 | 5 | 0.3200000000000003 |
| wanger | 11.120000000000005 | 2 | 266.64 | 29.626666666666665 | 4 | 5 | 1.8799999999999955 |
| liiu | 34.0 | 0 | 66.0 | 66.0 | 4 | 5 | 0.0 |
| qibaqiu | 45.68 | 0 | 54.32 | 54.32 | 4 | 5 | 0.3200000000000003 |
| wangshi | 22.230000000000004 | 1 | 155.54 | 38.885 | 4 | 5 | 1.769999999999996 |
| liwei | 11.560000000000002 | 2 | 265.32 | 29.48 | 4 | 5 | 1.4399999999999977 |
| wutong | 43.45 | 5 | 339.29999999999995 | 9.424999999999999 | 4 | 5 | 2.549999999999997 |
| lilisi | 11.120000000000005 | 4 | 444.4 | 17.776 | 4 | 5 | 3.8799999999999955 |
## %
注意:
精度在 hive 中是个很大的问题,类似这样的操作最好通过 round 指定精度
select 8.4 % 4,round(8.4 % 4 , 2) from order
_detail limit 1;
+----------------------+------
+--+
## 位与& 位或| 位异或^ 位取反~
select 4&6, 8&4, 4|8,6|8,4^8,6^4,~6,~3 from order
_detail limit 1;
逻辑运算
## 逻辑与AND 逻辑或OR 逻辑非NOT
注意:优先级一次为NOT AND OR
## 以下两条SQL互斥
select user
_id from order_detail where not ((user
_id='wanger' or user_id like
'li%') and user
_type='old');
+----------+--+
| zhangsa |
| wanger |
| liiu |
| qibaqiu |
| wangshi |
| wutong |
| lilisi |
select user
_id from order_detail where ((user
_id='wanger' or user_id like
'li%') and user
_type='old');
+----------+--+
| lisi |
数值计算函数
## 取整: round
语法: round(double a)
说明: 遵循四舍五入
## 指定精度取整: round
语法: round(double a, int d)
## 向下取整: floor
说明: 返回等于或者小于该 double 变量的最大的整数
## 向上取整: ceil
说明: 返回等于或者大于该 double 变量的最小的整数
## 向上取整: ceiling
说明: 与ceil功能相同
select user
_id,price,round(price),round(price,0),round(price,1),floor(price),ceil(price),ceiling(price) from order_detail;
+----------+--------
+-------+-------
+-------+------
+------+------
+--+
| zhangsa | 67.1 | 67.0 | 67.0 | 67.1 | 67 | 68 | 68 |
| lisi | 43.32 | 43.0 | 43.0 | 43.3 | 43 | 44 | 44 |
| wanger | 88.88 | 89.0 | 89.0 | 88.9 | 88 | 89 | 89 |
| liiu | 66.0 | 66.0 | 66.0 | 66.0 | 66 | 66 | 66 |
| qibaqiu | 54.32 | 54.0 | 54.0 | 54.3 | 54 | 55 | 55 |
| wangshi | 77.77 | 78.0 | 78.0 | 77.8 | 77 | 78 | 78 |
| liwei | 88.44 | 88.0 | 88.0 | 88.4 | 88 | 89 | 89 |
| wutong | 56.55 | 57.0 | 57.0 | 56.6 | 56 | 57 | 57 |
| lilisi | 88.88 | 89.0 | 89.0 | 88.9 | 88 | 89 | 89 |
## 取随机数: rand
说明: 返回一个 0 到 1 范围内的随机数。如果指定种子 seed(整数),则会得到一个稳定的随机数序列。
## 自然指数: exp 自然对数: ln
select user
_id,sales,price,rand(),rand(sales),exp(sales),ln(price) from order_detail;
+----------+--------
+--------+----------------------
+----------------------+---------------------
+---------------------+--+
| zhangsa | 2 | 67.1 | 0.708066911383928 | 0.7311469360199058 | 7.38905609893065 | 4.206184043977636 |
| lisi | 1 | 43.32 | 0.509552420396184 | 0.9014476240300544 | 2.718281828459045 | 3.76861442213279 |
| wanger | 3 | 88.88 | 0.2462642074705902 | 0.49682259343089075 | 20.085536923187668 | 4.487287145331375 |
| liiu | 1 | 66.0 | 0.720800913818751 | 0.9858769332362016 | 2.718281828459045 | 4.189654742026425 |
| qibaqiu | 1 | 54.32 | 0.7358273250797408 | 0.8571240443456863 | 2.718281828459045 | 3.9948924832504407 |
| wangshi | 2 | 77.77 | 0.11092554405907218 | 0.9874208338984266 | 7.38905609893065 | 4.353755752706852 |
| liwei | 3 | 88.44 | 0.5162574691353392 | 0.2281579303734177 | 20.085536923187668 | 4.482324355989245 |
| wutong | 6 | 56.55 | 0.2753658209591686 | 0.07479382813444624 | 403.4287934927351 | 4.03512520256213 |
| lilisi | 5 | 88.88 | 0.25777632824045826 | 0.7431577182910525 | 148.4131591025766 | 4.487287145331375 |
## 以10为底对数: log10 以2为底对数: log2
## 对数: log
语法: log(double base, double a)
说明: 返回以 base 为底的 a 的对数
select log10(100),log2(8),log(4,256) from order
_detail limit 1;
+------+------
+------+--+
## 幂运算: pow power 开平方: sqrt
select pow(2,4), power(2,4),sqrt(16) from order
_detail limit 1;
+-------+-------
+------+--+
## 二进制: bin 十六进制: hex 反转十六进制: unhex
## 进制转换: conv
语法: conv(BIGINT num, int from
_base, int to_base)
说明: 将数值 num 从 from
_base 进制转化到 to_base 进制
select bin(7),hex(
'19'),hex(
'abc'),unhex(
'616263'),unhex(
'41'),conv(17,10,16),conv(17,10,2) from order
_detail limit 1;
+------+-------
+---------+------
+------+------
+--------+--+
## 绝对值:abs 正取余:pmod 正弦:sin 反正弦:asin 余弦:cos 反余弦:acos 返回A的值:positive 返回A的相反数:negative
select abs(-13),abs(10.10),pmod(9,4),pmod(-9,4),
sin(0.8),asin(0.7173560908995228),cos(0.9), acos(0.6216099682706644),
positive(-10),negative(-10)
from order
_detail limit 1;
+------+-------
+------+------
+---------------------+------
+---------------------+------
+------+------
+--+
日期函数
## UNIX时间戳转日期: from
_unixtime
## 日期转UNIX时间戳,指定格式日期转UNIX 时间戳,获取当前UNIX时间戳: unix
_timestamp
说明: 转换格式为"yyyy-MM-dd HH:mm:ss"的日期到 UNIX 时间戳。如果转化失败,则返回 0。
select
from_unixtime(1323308943),
from_unixtime(1323308943,'yyyyMMdd'),
unix_timestamp(),
unix_timestamp('2017-12-07 16:01:03'),
unix_timestamp('20171207 16-01-03','yyyyMMdd HH-mm-ss')
from
## 日期时间转日期:to
_date 日期转年:year 日期转月:month 日期转天:day 日期转小时:hour 日期转分钟:minute 日期转秒:second
select
to
_date('2016-12-08 10:03:01'),
year('2016-12-08 10:03:01'),
month('2016-12-08'),
day('2016-12-08 10:03:01'),
hour('2016-12-08 10:03:01'),
minute('2016-12-08 10:03:01'),
second('2016-12-08 10:03:01')
from
order_detail limit 1;
+-------------+-------
+------+------
+------+------
+------+--+
## 日期转周:weekofyear 日期比较:datediff
select
weekofyear(
'2016-12-08 10:03:01'),
datediff(
'2016-12-08',
'2016-11-27')
from order
_detail limit 1;
+------+------
+--+
## 日期增加: date
_add 日期减少: date_sub
select date
_add('2016-12-08',10),date_add(
'2016-12-08',-10),
date
_sub('2016-12-08',-10),date_sub(
'2016-12-08',10) from order
_detail limit 1;
+-------------+-------------
+-------------+-------------
+--+
select
date
_add('20161208',10),
from_unixtime(unix
_timestamp(date_add(
'2016-12-08',10)),
'yyyyMMdd'),
from
_unixtime(unix_timestamp(date
_add('2016-12-08',10),'yyyy-MM-dd'),'yyyyMMdd')
from order_detail limit 1;
+-------+-------
+-----------+--+
条件函数
##
IF CASE COALESCE
说明:
COALESCE返回参数中的第一个非空值;如果所有值都为
NULL,那么返回
NULL
select user_id,device_id,user_type,sales,
if(user_type=
'new',user_id,
'***'),
COALESCE(null,user_id,device_id,user_type),
COALESCE(null,null,device_id,user_type),
case user_type when
'new' then
'new_user' when
'old' then
'old_user'
else
'others' end,
case when user_type=
'new' and sales>=
5 then
'gold_user' when user_type=
'old' and sales<
3 then
'bronze_user'
else
'silver_user' end
from order_detail;
+----------+-------------+------------+--------+----------+----------+-------------+-----------+--------------+--+
| user_id | device_id | user_type | sales | _c4 | _c5 | _c6 | _c7 | _c8 |
+----------+-------------+------------+--------+----------+----------+-------------+-----------+--------------+--+
| zhangsa | dfsadsa323 | new | 2 | zhangsa | zhangsa | dfsadsa323 | new_user | silver_user |
| lisi | 543gfd
| old | 1 | ***
| lisi | 543gfd
| old_user | bronze_user |
| wanger | 65ghf
| new | 3 | wanger | wanger | 65ghf
| new_user | silver_user |
| liiu | fdsfagwe | new | 1 | liiu | liiu | fdsfagwe | new_user | silver_user |
| qibaqiu | fds | new | 1 | qibaqiu | qibaqiu | fds | new_user | silver_user |
| wangshi | f332 | old | 2 | ***
| wangshi | f332 | old_user | bronze_user |
| liwei | hfd | old | 3 | ***
| liwei | hfd | old_user | silver_user |
| wutong | 543gdfsd
| new | 6 | wutong | wutong | 543gdfsd
| new_user | gold_user |
| lilisi | dsfgg | new | 5 | lilisi | lilisi | dsfgg | new_user | gold_user |
| qishili | fds | new | 5 | qishili | qishili | fds | new_user | gold_user |
+----------+-------------+------------+--------+----------+----------+-------------+-----------+--------------+--+
字符串函数
## 字符串长度:length 字符串反转:reverse 字符串连接:concat 带分隔符字符串连接:concat
_ws
select
user
_id,device_id,user
_type,length(user_id),reverse(user
_id),
concat(user_id,device
_id,user_type),concat
_ws('_',user
_id,device_id,user
_type)
from order_detail;
+----------+-------------
+------------+------
+----------+-----------------------
+-------------------------+--+
| zhangsa | dfsadsa323 | new | 7 | asgnahz | zhangsadfsadsa323new | zhangsa
_dfsadsa323_new |
| lisi | 543gfd | old | 4 | isil | lisi543gfdold | lisi
_543gfd_old |
| wanger | 65ghf | new | 6 | regnaw | wanger65ghfnew | wanger
_65ghf_new |
| liiu | fdsfagwe | new | 4 | uiil | liiufdsfagwenew | liiu
_fdsfagwe_new |
| qibaqiu | fds | new | 7 | uiqabiq | qibaqiufdsnew | qibaqiu
_fds_new |
| wangshi | f332 | old | 7 | ihsgnaw | wangshif332old | wangshi
_f332_old |
| liwei | hfd | old | 5 | iewil | liweihfdold | liwei
_hfd_old |
| wutong | 543gdfsd | new | 6 | gnotuw | wutong543gdfsdnew | wutong
_543gdfsd_new |
| lilisi | dsfgg | new | 6 | isilil | lilisidsfggnew | lilisi
_dsfgg_new |
| qishili | fds | new | 7 | ilihsiq | qishilifdsnew | qishili
_fds_new |
## 字符串截取函数: substr,substring
语法: substr(string A, int start),substring(string A, int start)
说明:返回字符串 A 从 start 位置到结尾的字符串
语法: substr(string A, int start, int len),substring(string A, int start, int len)
说明:返回字符串A从start位置开始,长度为len的字符串
select
user
_id,substr(user_id,3),substr(user
_id,-2),
substring(user_id,1,2),substr(user
_id,-2,2)
from order_detail;
+----------+--------
+------+------
+------+--+
| zhangsa | angsa | sa | zh | sa |
| lisi | si | si | li | si |
| wanger | nger | er | wa | er |
| liiu | iu | iu | li | iu |
| qibaqiu | baqiu | iu | qi | iu |
| wangshi | ngshi | hi | wa | hi |
| liwei | wei | ei | li | ei |
| wutong | tong | ng | wu | ng |
| lilisi | lisi | si | li | si |
## 字符串转大写:upper,ucase 字符串转小写:lower,lcase
select user
_id,upper(user_id),ucase(user
_id),
lower(upper(user_id)),lcase(ucase(user
_id)),lower(ucase(user_id))
from order
_detail;
+----------+----------
+----------+----------
+----------+----------
+--+
| zhangsa | ZHANGSA | ZHANGSA | zhangsa | zhangsa | zhangsa |
| lisi | LISI | LISI | lisi | lisi | lisi |
| wanger | WANGER | WANGER | wanger | wanger | wanger |
| liiu | LIIU | LIIU | liiu | liiu | liiu |
| qibaqiu | QIBAQIU | QIBAQIU | qibaqiu | qibaqiu | qibaqiu |
| wangshi | WANGSHI | WANGSHI | wangshi | wangshi | wangshi |
| liwei | LIWEI | LIWEI | liwei | liwei | liwei |
| wutong | WUTONG | WUTONG | wutong | wutong | wutong |
| lilisi | LILISI | LILISI | lilisi | lilisi | lilisi |
## 去两边的空格:trim 左边去空格:ltrim 右边去空格:rtrim
select trim(' abc
'),ltrim(' abc'),rtrim(
'abc ') from order
_detail limit 1;
+------+------
+------+--+
## 正则表达式替换: regexp
_replace
说明:将字符串 A 中的符合 java 正则表达式 B 的部分替换为 C。注意,在有些情况下要使用转义字符,
类似 oracle 中的 regexp_replace 函数。
## 正则表达式解析: regexp
_extract
将字符串 subject 按照 pattern 正则表达式的规则拆分,返回 index 指定的字符。
注意,在有些情况下要使用转义字符,如等号要用双竖线转义,这是java正则表达式的规则。
select user
_id,regexp_replace(user
_id, 'li|ng', '**'),
regexp_extract(user
_id,'li(.*?)(si)',1),
regexp_extract(user
_id,'li(.*?)(si)',2),
regexp_extract(user
_id,'li(.*?)(si)',0)
from order_detail;
+----------+----------
+------+------
+---------+--+
| zhangsa | zha*
*sa | | | |
| lisi | **si | | si | lisi |
| wanger | wa**er | | | |
| liiu | **iu | | | |
| qibaqiu | qibaqiu | | | |
| wangshi | wa**shi | | | |
| liwei | **wei | | | |
| wutong | wuto** | | | |
| lilisi | ***
*si | li | si | lilisi |
| qishili | qishi** | | | |
+----------+----------
+------+------
+---------+--+
select
regexp
_extract('http://facebook.com/path1/p.php?k1=v1543643&k2=v3245#Ref1', '.*?k1\\=([^&]+)', 1),
regexp_extract(
'http://facebook.com/path1/p.php?k1=v1543643&k2=v3245#Ref1',
'.*?k2\\=([^#]+)', 1)
from order
_detail limit 1;
+-----------+--------
+--+
## URL解析:parse
_url
语法: parse_url(string urlString, string partToExtract [, string keyToExtract])
说明:返回 URL 中指定的部分。
partToExtract 的有效值为: HOST, PATH, QUERY, REF,PROTOCOL, AUTHORITY, FILE, and USERINFO.
举例:
select
parse
_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST'),
parse_url(
'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1',
'PATH'),
parse
_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY'),
parse_url(
'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1',
'QUERY',
'k2'),
parse
_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'REF'),
parse_url(
'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1',
'PROTOCOL'),
parse
_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'AUTHORITY'),
parse_url(
'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1',
'FILE')
from order
_detail limit 1;
+---------------+---------------
+--------------+------
+-------+-------
+---------------+---------------------------
+--+
## json解析:
get
_json_object
语法: get
_json_object(string json
_string, string path)
说明:解析 json 的字符串 json_string,返回 path 指定的内容。如果输入的 json 字符串无效,那么返回 NULL。
select
get
_json_object(
'{"store":
{"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],
"bicycle":{"price":19.95,"color":"red"}
},
"email":"amy@only_for_json_udf_test.net",
"owner":"amy"
}',
'$.owner'),
get
_json_object(
'{"store":
{"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],
"bicycle":{"price":19.95,"color":"red"}
},
"email":"amy@only_for_json_udf_test.net",
"owner":"amy"
}',
'$.store.fruit[0].type')
from order
_detail limit 1;
+------+--------
+--+
## json
_tuple
语法: json_tuple(string jsonStr,string k1,string k2, ...)
参数为一组键k1,k2……和JSON字符串,返回值的元组。该方法比 get
_json_object 高效,因为可以在一次调用中输入多个键.
select a.user
_id, b.*
from order_detail a
lateral view
json
_tuple('{"store":
{"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],
"bicycle":{"price":19.95,"color":"red"}
},
"email":"amy@only_for
_json_udf
_test.net",
"owner":"amy"
}', 'email', 'owner') b as email, owner limit 1;
+----------+---------------------------------
+--------+--+
## parse
_url_tuple
SELECT b.*
from
(
select
'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' as urlstr
from
order
_detail
limit 1
)a
LATERAL VIEW
parse_url
_tuple(a.urlstr, 'HOST', 'PATH', 'QUERY', 'QUERY:k1') b as host, path, query, query_k1 LIMIT 1;
+---------------+---------------
+--------------+-----------
+--+
空格字符串:space 重复字符串:repeat 首字符ascii:ascii
select space(10), length(space(10)), repeat(
'abc',5), ascii(
'abcde')
from order
_detail
limit 1;
+-------------+------
+------------------+------
+--+
左补足函数:lpad 右补足函数:rpad
语法: lpad(string str, int len, string pad)
说明:lpad将 str 进行用 pad 进行左补足到 len 位, rpad将 str 进行用 pad 进行右补足到 len 位
注意:与 GP,ORACLE 不同; pad不能默认
select lpad(
'abc',10,
'td'),rpad(
'abc',10,
'td')
from order
_detail
limit 1;
+-------------+-------------
+--+
分割字符串函数: split
集合查找函数: find
_in_set
语法: find
_in_set(string str, string strList)
说明: 返回 str 在 strlist 第一次出现的位置, strlist 是用逗号分割的字符串。如果没有找该 str 字符,则返回 0
select
split(
'abtcdtef',
't'),
find
_in_set(
'ab',
'ef,ab,de'),
find
_in_set(
'at',
'ef,ab,de')
from order
_detail
limit 1;
+-------------------+------+------+--+
| _c0 |
_c1 | _c2 |
+-------------------+------
+------+--+
| ["ab","cd","ef"] | 2 | 0 |
+-------------------+------
+------+--+
## string转map:str
_to_map
语法:str
_to_map(text[, delimiter1, delimiter2])
说明:使用两个分隔符将文本拆分为键值对。 Delimiter1将文本分成K-V对,Delimiter2分割每个K-V对。
对于delimiter1默认分隔符是
',',对于delimiter2默认分隔符是
':'。
select str
_to_map(
'aaa:11&bbb:22',
'&',
':')
select str
_to_map(
'aaa:11&bbb:22',
'&',
':')[
'aaa']
from order
_detail limit 1;
+------+--+
select str
_to_map(
'aaa:11,bbb:22')
from person limit 1;
{"bbb":"22","aaa":"11"}
集合统计函数
## 个数统计:count 总和统计:sum
语法: count(
*), count(expr), count(DISTINCT expr[, expr_.])
说明:
count(*)统计检索出的行的个数,包括 NULL 值的行;
count(expr)返回指定字段的非空值的个数;
count(DISTINCT expr[, expr
_.])返回指定字段的不同的非空值的个数
语法: sum(col), sum(DISTINCT col)
说明:
sum(col)统计结果集中 col 的相加的结果; sum(DISTINCT col)统计结果中 col 不同值
select
count(
*),count(user_type),count(distinct user_type),
sum(sales),sum(distinct sales)
from order_detail;
+------+------
+------+------
+------+--+
## 平均值统计:avg 最小值统计:min 最大值统计:max
select
avg(sales),avg(distinct sales),min(sales),max(distinct sales)
from order
_detail;
+------+------
+------+------
+--+
## 标准差:stddev
_samp, stddev, stddev_pop
stddev
_pop <==> stddev
## 方差:var
_samp, var_pop
当我们需要真实的标准差/方差的时候最好是使用: stddev stddev
_pop var_pop
而只是需要得到少量数据的标准差/方差的近似值可以选用: stddev
_samp var_samp
select
var
_pop(sales),var_samp(sales),
stddev
_pop(sales),stddev(sales),stddev_samp(sales),
pow(stddev
_pop(sales),2),pow(stddev_samp(sales),2)
from order
_detail;
+-------+--------------------
+---------------------+---------------------
+---------------------+---------------------
+---------------------+--+
## 百分位数: percentile 近似百分位数: percentile
_approx 直方图: histogram_numeric
语法: percentile
_approx(DOUBLE col, p [, B])
返回值: double
说明: 求近似的第 pth 个百分位数, p 必须介于 0 和 1 之间,返回类型为 double,但是col 字段支持浮点类型。
参数 B 控制内存消耗的近似精度, B越大,结果的准确度越高。默认为 10,000。
当 col 字段中的 distinct 值的个数小于 B 时,结果为准确的百分位数
select
percentile(sales,0.2),
percentile
_approx(sales,0.5,10000),
histogram_numeric(sales,5)
from order
_detail;
+------+------
+----------------------------------------------------------------------------------------------+--+
| 1.0 | 2.0 | [{"x":1.0,"y":3.0},{"x":2.0,"y":2.0},{"x":3.0,"y":2.0},{"x":5.0,"y":2.0},{"x":6.0,"y":1.0}] |
+------+------
+----------------------------------------------------------------------------------------------+--+
select
inline(histogram
_numeric(sales,10))
from order_detail;
+------+------
+--+
| 1.0 | 3.0 |
| 2.0 | 2.0 |
| 3.0 | 2.0 |
| 5.0 | 2.0 |
### 后面可以输入多个百分位数,返回类型也为 array<double>,其中为对应的百分位数。
select
percentile(sales,array(0.2,0.4,0.6)),
percentile
_approx(sales,array(0.2,0.4,0.6),10000)
from order_detail;
+----------------+----------------
+--+
| [1.0,2.0,3.0] | [1.0,1.5,2.5] |
+----------------+----------------
+--+
复杂类型访问操作及统计函数
测试数据集:
tony 1338 hello,woddd 1,2 a1,a2,a3 k1:1.0,k2:2.0,k3:3.0 s1,s2,s3,4
mark 5453 kke,ladyg 2,3 a4,a5,a6 k4:4.0,k5:5.0,k2:6.0 s4,s5,s6,6
ivyfd 4323 aa,thq,dsx 3,6 a7,a8,a9 k7:7.0,k8:8.0,k2:9.0 s7,s8,s9,9
drop table employees;
create external table if not exists employees(
name string,
salary string,
happy
_word string,
happy_num array<int>,
subordinates array<string>,
deductions map<string,float>,
address struct<street:string,city:string,state:string,zip:int>
)
row format delimited fields terminated by
'\t'
collection items terminated by
','
map keys terminated by
':'
lines terminated by
'\n'
stored as textfile;
hdfs dfs -put /home/liguodong/data/muldata.txt /temp/lgd
load data inpath
'/temp/lgd/muldata.txt' overwrite into table employees;
select * from employees;
| tony | 1338 | hello,woddd | [1,2] | ["a1","a2","a3"] | {"k1":1.0,"k2":2.0,"k3":3.0} | {"street":"s1","city":"s2","state":"s3","zip":4} |
| mark | 5453 | kke,ladyg | [2,3] | ["a4","a5","a6"] | {"k4":4.0,"k5":5.0,"k2":6.0} | {"street":"s4","city":"s5","state":"s6","zip":6} |
| ivyfd | 4323 | aa,thq,dsx | [3,6] | ["a7","a8","a9"] | {"k7":7.0,"k8":8.0,"k2":9.0} | {"street":"s7","city":"s8","state":"s9","zip":9} |
+--------+---------
+--------------+------------
+-------------------+-------------------------------
+---------------------------------------------------+--+
## 访问数组 Map 结构体
select
name,salary,
subordinates[1],deductions[
'k2'],deductions[
'k3'],address.city
| tony | 1338 | a2 | 2.0 | 3.0 | s2 |
| mark | 5453 | a5 | 6.0 | NULL | s5 |
## Map类型长度 Array类型长度
select size(deductions),size(subordinates) from employees limit 1;
+------+------
+--+
## 类型转换: cast
select cast(salary as int),cast(deductions[
'k2'] as bigint) from employees;
+---------+------
+--+
| 1338 | 2 |
| 5453 | 6 |
### LATERAL VIEW 行转列
SELECT
name, ad
_subordinate
FROM employees
LATERAL VIEW explode(subordinates) addTable AS ad_subordinate;
+--------+-----------------
+--+
| tony | a1 |
| tony | a2 |
| tony | a3 |
| mark | a4 |
| mark | a5 |
| mark | a6 |
| ivyfd | a7 |
| ivyfd | a8 |
SELECT
name, count(1)
FROM employees
LATERAL VIEW explode(subordinates) addTable AS ad
_subordinate
group by name;
+--------+------+--+
| name | _c1 |
+--------+------
+--+
| ivyfd | 3 |
| mark | 3 |
SELECT ad
_subordinate, ad_num
FROM employees
LATERAL VIEW explode(subordinates) addTable AS ad
_subordinate
LATERAL VIEW explode(happy_num) addTable2 AS ad
_num;
+-----------------+---------+--+
| ad_subordinate | ad
_num |
+-----------------+---------+--+
| a1 | 1 |
| a1 | 2 |
| a2 | 1 |
| a2 | 2 |
| a3 | 1 |
| a3 | 2 |
| a4 | 2 |
| a4 | 3 |
| a5 | 2 |
| a5 | 3 |
| a6 | 2 |
| a6 | 3 |
| a7 | 3 |
| a7 | 6 |
| a8 | 3 |
| a8 | 6 |
| a9 | 3 |
| a9 | 6 |
+-----------------+---------+--+
### 多个LATERAL VIEW
SELECT
name, count(1)
FROM employees
LATERAL VIEW explode(subordinates) addTable AS ad
_subordinate
LATERAL VIEW explode(happy_num) addTable2 AS ad
_num
group by name;
+--------+------
+--+
| ivyfd | 6 |
| mark | 6 |
### 不满足条件产生空行
SELECT AA.name, BB.* FROM employees AA
LATERAL VIEW
+-------+----
+--+
### OUTER 避免永远不产生结果,无满足条件的行,在该列会产生NULL值。
SELECT AA.name, BB.* FROM employees AA
LATERAL VIEW
| tony | NULL |
| mark | NULL |
### 字符串切分成多列
SELECT
name, word
FROM employees
LATERAL VIEW explode(split(happy
_word,',')) addTable AS word;
+--------+--------
+--+
| tony | hello |
| tony | woddd |
| mark | kke |
| mark | ladyg |
| ivyfd | aa |
| ivyfd | thq |