Hive常用函数大全(一)(关系数学逻辑数值日期条件字符串集合统计复杂类型)

    xiaoxiao2021-03-25  118

    测试数据集:

    create external table if not exists order_detail( user_id string, device_id string, user_type string, price double, sales int ) row format delimited fields terminated by '\t' lines terminated by '\n' stored as textfile; hdfs dfs -put /home/liguodong/data/data.text /temp/lgd load data inpath '/temp/lgd/data.text' overwrite into table order_detail; select * from order_detail; +----------+-------------+------------+--------+--------+--+ | user_id | device_id | user_type | price | sales | +----------+-------------+------------+--------+--------+--+ | zhangsa | dfsadsa323 | new | 67.1 | 2 | | lisi | 543gfd | old | 43.32 | 1 | | wanger | 65ghf | new | 88.88 | 3 | | liiu | fdsfagwe | new | 66.0 | 1 | | qibaqiu | fds | new | 54.32 | 1 | | wangshi | f332 | old | 77.77 | 2 | | liwei | hfd | old | 88.44 | 3 | | wutong | 543gdfsd | new | 56.55 | 6 | | lilisi | dsfgg | new | 88.88 | 5 | | qishili | fds | new | 66.66 | 5 | +----------+-------------+------------+--------+--------+--+

    关系运算

    ## > < = ##注意: String 的比较要注意(常用的时间比较可以先 to_date 之后再比较) select long_time>short_time, long_time<short_time,long_time=short_time, to_date(long_time)=to_date(short_time) from ( select '2017-01-11 00:00:00' as long_time, '2017-01-11' as short_time from order_detail limit 1 )bb; result: true false false true ## 空值判断 select 1 from order_detail where NULL is NULL limit 1; ## 非空判断 select 1 from order_detail where 1 is not NULL limit 1; ## LIKE 语法: A LIKE B 描述: 字符串A符合表达式B的正则语法,则为TRUE;否则为FALSE. B中字符”_”表示任意单个字符,而字符”%”表示任意数量的字符。 select user_id from order_detail where user_id like 'li%'; +----------+--+ | user_id | +----------+--+ | lisi | | liiu | | liwei | | lilisi | +----------+--+ select user_id from order_detail where user_id like 'li__'; +----------+--+ | user_id | +----------+--+ | lisi | | liiu | +----------+--+ ## RLIKE 语法: A RLIKE B 描述: 字符串A符合JAVA正则表达式 B 的正则语法,则为 TRUE;否则为 FALSE。 select user_id from order_detail where user_id rlike '^l.*i$'; +----------+--+ | user_id | +----------+--+ | lisi | | liwei | | lilisi | +----------+--+ select device_id from order_detail where device_id rlike '^\\w+$'; +-------------+--+ | device_id | +-------------+--+ | dfsadsa323 | | 543gfd | | 65ghf | | fdsfagwe | | fds | | f332 | | hfd | | 543gdfsd | | dsfgg | | fds | +-------------+--+ select device_id from order_detail where device_id rlike '^[a-zA-Z]+$'; +------------+--+ | device_id | +------------+--+ | fdsfagwe | | fds | | hfd | | dsfgg | | fds | +------------+--+ select device_id from order_detail where device_id rlike '^[a-zA-Z]{4,}$'; +------------+--+ | device_id | +------------+--+ | fdsfagwe | | dsfgg | +------------+--+ ## REGEXP 语法: A REGEXP B 描述: 功能与 RLIKE 相同 select device_id from order_detail where device_id REGEXP '^[a-zA-Z]{4,}$'; +------------+--+ | device_id | +------------+--+ | fdsfagwe | | dsfgg | +------------+--+

    数学运算

    ## + - * / 注意: hive中最高精度的数据类型是 double,只精确到小数点后16位,在做除法运算的时候要特别注意 select user_id,100-price,sales-1,price*sales,price/sales, ceil(28.0/6.999999999999999999999),ceil(28.0/6.99999999999999), price%sales from order_detail; +----------+---------------------+------+---------------------+---------------------+------+------+---------------------+--+ | user_id | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 | _c7 | +----------+---------------------+------+---------------------+---------------------+------+------+---------------------+--+ | zhangsa | 32.900000000000006 | 1 | 134.2 | 33.55 | 4 | 5 | 1.0999999999999943 | | lisi | 56.68 | 0 | 43.32 | 43.32 | 4 | 5 | 0.3200000000000003 | | wanger | 11.120000000000005 | 2 | 266.64 | 29.626666666666665 | 4 | 5 | 1.8799999999999955 | | liiu | 34.0 | 0 | 66.0 | 66.0 | 4 | 5 | 0.0 | | qibaqiu | 45.68 | 0 | 54.32 | 54.32 | 4 | 5 | 0.3200000000000003 | | wangshi | 22.230000000000004 | 1 | 155.54 | 38.885 | 4 | 5 | 1.769999999999996 | | liwei | 11.560000000000002 | 2 | 265.32 | 29.48 | 4 | 5 | 1.4399999999999977 | | wutong | 43.45 | 5 | 339.29999999999995 | 9.424999999999999 | 4 | 5 | 2.549999999999997 | | lilisi | 11.120000000000005 | 4 | 444.4 | 17.776 | 4 | 5 | 3.8799999999999955 | | qishili | 33.34 | 4 | 333.29999999999995 | 13.331999999999999 | 4 | 5 | 1.6599999999999966 | +----------+---------------------+------+---------------------+---------------------+------+------+---------------------+--+ ## % 注意: 精度在 hive 中是个很大的问题,类似这样的操作最好通过 round 指定精度 select 8.4 % 4,round(8.4 % 4 , 2) from order_detail limit 1; +----------------------+------+--+ | _c0 | _c1 | +----------------------+------+--+ | 0.40000000000000036 | 0.4 | +----------------------+------+--+ ## 位与& 位或| 位异或^ 位取反~ select 4&6, 8&4, 4|8,6|8,4^8,6^4,~6,~3 from order_detail limit 1;

    逻辑运算

    ## 逻辑与AND 逻辑或OR 逻辑非NOT 注意:优先级一次为NOT AND OR ## 以下两条SQL互斥 select user_id from order_detail where not ((user_id='wanger' or user_id like 'li%') and user_type='old'); +----------+--+ | user_id | +----------+--+ | zhangsa | | wanger | | liiu | | qibaqiu | | wangshi | | wutong | | lilisi | | qishili | +----------+--+ select user_id from order_detail where ((user_id='wanger' or user_id like 'li%') and user_type='old'); +----------+--+ | user_id | +----------+--+ | lisi | | liwei | +----------+--+

    数值计算函数

    ## 取整: round 语法: round(double a) 说明: 遵循四舍五入 ## 指定精度取整: round 语法: round(double a, int d) ## 向下取整: floor 说明: 返回等于或者小于该 double 变量的最大的整数 ## 向上取整: ceil 说明: 返回等于或者大于该 double 变量的最小的整数 ## 向上取整: ceiling 说明: 与ceil功能相同 select user_id,price,round(price),round(price,0),round(price,1),floor(price),ceil(price),ceiling(price) from order_detail; +----------+--------+-------+-------+-------+------+------+------+--+ | user_id | price | _c2 | _c3 | _c4 | _c5 | _c6 | _c7 | +----------+--------+-------+-------+-------+------+------+------+--+ | zhangsa | 67.1 | 67.0 | 67.0 | 67.1 | 67 | 68 | 68 | | lisi | 43.32 | 43.0 | 43.0 | 43.3 | 43 | 44 | 44 | | wanger | 88.88 | 89.0 | 89.0 | 88.9 | 88 | 89 | 89 | | liiu | 66.0 | 66.0 | 66.0 | 66.0 | 66 | 66 | 66 | | qibaqiu | 54.32 | 54.0 | 54.0 | 54.3 | 54 | 55 | 55 | | wangshi | 77.77 | 78.0 | 78.0 | 77.8 | 77 | 78 | 78 | | liwei | 88.44 | 88.0 | 88.0 | 88.4 | 88 | 89 | 89 | | wutong | 56.55 | 57.0 | 57.0 | 56.6 | 56 | 57 | 57 | | lilisi | 88.88 | 89.0 | 89.0 | 88.9 | 88 | 89 | 89 | | qishili | 66.66 | 67.0 | 67.0 | 66.7 | 66 | 67 | 67 | +----------+--------+-------+-------+-------+------+------+------+--+ ## 取随机数: rand 说明: 返回一个 0 到 1 范围内的随机数。如果指定种子 seed(整数),则会得到一个稳定的随机数序列。 ## 自然指数: exp 自然对数: ln select user_id,sales,price,rand(),rand(sales),exp(sales),ln(price) from order_detail; +----------+--------+--------+----------------------+----------------------+---------------------+---------------------+--+ | user_id | sales | price | _c3 | _c4 | _c5 | _c6 | +----------+--------+--------+----------------------+----------------------+---------------------+---------------------+--+ | zhangsa | 2 | 67.1 | 0.708066911383928 | 0.7311469360199058 | 7.38905609893065 | 4.206184043977636 | | lisi | 1 | 43.32 | 0.509552420396184 | 0.9014476240300544 | 2.718281828459045 | 3.76861442213279 | | wanger | 3 | 88.88 | 0.2462642074705902 | 0.49682259343089075 | 20.085536923187668 | 4.487287145331375 | | liiu | 1 | 66.0 | 0.720800913818751 | 0.9858769332362016 | 2.718281828459045 | 4.189654742026425 | | qibaqiu | 1 | 54.32 | 0.7358273250797408 | 0.8571240443456863 | 2.718281828459045 | 3.9948924832504407 | | wangshi | 2 | 77.77 | 0.11092554405907218 | 0.9874208338984266 | 7.38905609893065 | 4.353755752706852 | | liwei | 3 | 88.44 | 0.5162574691353392 | 0.2281579303734177 | 20.085536923187668 | 4.482324355989245 | | wutong | 6 | 56.55 | 0.2753658209591686 | 0.07479382813444624 | 403.4287934927351 | 4.03512520256213 | | lilisi | 5 | 88.88 | 0.25777632824045826 | 0.7431577182910525 | 148.4131591025766 | 4.487287145331375 | | qishili | 5 | 66.66 | 0.06419187859857822 | 0.9495832704567262 | 148.4131591025766 | 4.199605072879594 | +----------+--------+--------+----------------------+----------------------+---------------------+---------------------+--+ ## 以10为底对数: log10 以2为底对数: log2 ## 对数: log 语法: log(double base, double a) 说明: 返回以 base 为底的 a 的对数 select log10(100),log2(8),log(4,256) from order_detail limit 1; +------+------+------+--+ | _c0 | _c1 | _c2 | +------+------+------+--+ | 2.0 | 3.0 | 4.0 | +------+------+------+--+ ## 幂运算: pow power 开平方: sqrt select pow(2,4), power(2,4),sqrt(16) from order_detail limit 1; +-------+-------+------+--+ | _c0 | _c1 | _c2 | +-------+-------+------+--+ | 16.0 | 16.0 | 4.0 | +-------+-------+------+--+ ## 二进制: bin 十六进制: hex 反转十六进制: unhex ## 进制转换: conv 语法: conv(BIGINT num, int from_base, int to_base) 说明: 将数值 num 从 from_base 进制转化到 to_base 进制 select bin(7),hex('19'),hex('abc'),unhex('616263'),unhex('41'),conv(17,10,16),conv(17,10,2) from order_detail limit 1; +------+-------+---------+------+------+------+--------+--+ | _c0 | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 | +------+-------+---------+------+------+------+--------+--+ | 111 | 3139 | 616263 | abc | A | 11 | 10001 | +------+-------+---------+------+------+------+--------+--+ ## 绝对值:abs 正取余:pmod 正弦:sin 反正弦:asin 余弦:cos 反余弦:acos 返回A的值:positive 返回A的相反数:negative select abs(-13),abs(10.10),pmod(9,4),pmod(-9,4), sin(0.8),asin(0.7173560908995228),cos(0.9), acos(0.6216099682706644), positive(-10),negative(-10) from order_detail limit 1; +------+-------+------+------+---------------------+------+---------------------+------+------+------+--+ | _c0 | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 | _c7 | _c8 | _c9 | +------+-------+------+------+---------------------+------+---------------------+------+------+------+--+ | 13 | 10.1 | 1 | 3 | 0.7173560908995228 | 0.8 | 0.6216099682706644 | 0.9 | -10 | 10 | +------+-------+------+------+---------------------+------+---------------------+------+------+------+--+

    日期函数

    ## UNIX时间戳转日期: from_unixtime ## 日期转UNIX时间戳,指定格式日期转UNIX 时间戳,获取当前UNIX时间戳: unix_timestamp 说明: 转换格式为"yyyy-MM-dd HH:mm:ss"的日期到 UNIX 时间戳。如果转化失败,则返回 0。 select from_unixtime(1323308943), from_unixtime(1323308943,'yyyyMMdd'), unix_timestamp(), unix_timestamp('2017-12-07 16:01:03'), unix_timestamp('20171207 16-01-03','yyyyMMdd HH-mm-ss') from order_detail limit 1; +----------------------+-----------+-------------+-------------+-------------+--+ | _c0 | _c1 | _c2 | _c3 | _c4 | +----------------------+-----------+-------------+-------------+-------------+--+ | 2011-12-08 09:49:03 | 20111208 | 1489029488 | 1512633663 | 1512633663 | +----------------------+-----------+-------------+-------------+-------------+--+ ## 日期时间转日期:to_date 日期转年:year 日期转月:month 日期转天:day 日期转小时:hour 日期转分钟:minute 日期转秒:second select to_date('2016-12-08 10:03:01'), year('2016-12-08 10:03:01'), month('2016-12-08'), day('2016-12-08 10:03:01'), hour('2016-12-08 10:03:01'), minute('2016-12-08 10:03:01'), second('2016-12-08 10:03:01') from order_detail limit 1; +-------------+-------+------+------+------+------+------+--+ | _c0 | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 | +-------------+-------+------+------+------+------+------+--+ | 2016-12-08 | 2016 | 12 | 8 | 10 | 3 | 1 | +-------------+-------+------+------+------+------+------+--+ ## 日期转周:weekofyear 日期比较:datediff select weekofyear('2016-12-08 10:03:01'), datediff('2016-12-08','2016-11-27') from order_detail limit 1; +------+------+--+ | _c0 | _c1 | +------+------+--+ | 49 | 11 | +------+------+--+ ## 日期增加: date_add 日期减少: date_sub select date_add('2016-12-08',10),date_add('2016-12-08',-10), date_sub('2016-12-08',-10),date_sub('2016-12-08',10) from order_detail limit 1; +-------------+-------------+-------------+-------------+--+ | _c0 | _c1 | _c2 | _c3 | +-------------+-------------+-------------+-------------+--+ | 2016-12-18 | 2016-11-28 | 2016-12-18 | 2016-11-28 | +-------------+-------------+-------------+-------------+--+ select date_add('20161208',10), from_unixtime(unix_timestamp(date_add('2016-12-08',10)),'yyyyMMdd'), from_unixtime(unix_timestamp(date_add('2016-12-08',10),'yyyy-MM-dd'),'yyyyMMdd') from order_detail limit 1; +-------+-------+-----------+--+ | _c0 | _c1 | _c2 | +-------+-------+-----------+--+ | NULL | NULL | 20161218 | +-------+-------+-----------+--+

    条件函数

    ## IF CASE COALESCE 说明: COALESCE返回参数中的第一个非空值;如果所有值都为 NULL,那么返回 NULL select user_id,device_id,user_type,sales, if(user_type='new',user_id,'***'), COALESCE(null,user_id,device_id,user_type), COALESCE(null,null,device_id,user_type), case user_type when 'new' then 'new_user' when 'old' then 'old_user' else 'others' end, case when user_type='new' and sales>=5 then 'gold_user' when user_type='old' and sales<3 then 'bronze_user' else 'silver_user' end from order_detail; +----------+-------------+------------+--------+----------+----------+-------------+-----------+--------------+--+ | user_id | device_id | user_type | sales | _c4 | _c5 | _c6 | _c7 | _c8 | +----------+-------------+------------+--------+----------+----------+-------------+-----------+--------------+--+ | zhangsa | dfsadsa323 | new | 2 | zhangsa | zhangsa | dfsadsa323 | new_user | silver_user | | lisi | 543gfd | old | 1 | *** | lisi | 543gfd | old_user | bronze_user | | wanger | 65ghf | new | 3 | wanger | wanger | 65ghf | new_user | silver_user | | liiu | fdsfagwe | new | 1 | liiu | liiu | fdsfagwe | new_user | silver_user | | qibaqiu | fds | new | 1 | qibaqiu | qibaqiu | fds | new_user | silver_user | | wangshi | f332 | old | 2 | *** | wangshi | f332 | old_user | bronze_user | | liwei | hfd | old | 3 | *** | liwei | hfd | old_user | silver_user | | wutong | 543gdfsd | new | 6 | wutong | wutong | 543gdfsd | new_user | gold_user | | lilisi | dsfgg | new | 5 | lilisi | lilisi | dsfgg | new_user | gold_user | | qishili | fds | new | 5 | qishili | qishili | fds | new_user | gold_user | +----------+-------------+------------+--------+----------+----------+-------------+-----------+--------------+--+

    字符串函数

    ## 字符串长度:length 字符串反转:reverse 字符串连接:concat 带分隔符字符串连接:concat_ws select user_id,device_id,user_type,length(user_id),reverse(user_id), concat(user_id,device_id,user_type),concat_ws('_',user_id,device_id,user_type) from order_detail; +----------+-------------+------------+------+----------+-----------------------+-------------------------+--+ | user_id | device_id | user_type | _c3 | _c4 | _c5 | _c6 | +----------+-------------+------------+------+----------+-----------------------+-------------------------+--+ | zhangsa | dfsadsa323 | new | 7 | asgnahz | zhangsadfsadsa323new | zhangsa_dfsadsa323_new | | lisi | 543gfd | old | 4 | isil | lisi543gfdold | lisi_543gfd_old | | wanger | 65ghf | new | 6 | regnaw | wanger65ghfnew | wanger_65ghf_new | | liiu | fdsfagwe | new | 4 | uiil | liiufdsfagwenew | liiu_fdsfagwe_new | | qibaqiu | fds | new | 7 | uiqabiq | qibaqiufdsnew | qibaqiu_fds_new | | wangshi | f332 | old | 7 | ihsgnaw | wangshif332old | wangshi_f332_old | | liwei | hfd | old | 5 | iewil | liweihfdold | liwei_hfd_old | | wutong | 543gdfsd | new | 6 | gnotuw | wutong543gdfsdnew | wutong_543gdfsd_new | | lilisi | dsfgg | new | 6 | isilil | lilisidsfggnew | lilisi_dsfgg_new | | qishili | fds | new | 7 | ilihsiq | qishilifdsnew | qishili_fds_new | ## 字符串截取函数: substr,substring 语法: substr(string A, int start),substring(string A, int start) 说明:返回字符串 A 从 start 位置到结尾的字符串 语法: substr(string A, int start, int len),substring(string A, int start, int len) 说明:返回字符串A从start位置开始,长度为len的字符串 select user_id,substr(user_id,3),substr(user_id,-2), substring(user_id,1,2),substr(user_id,-2,2) from order_detail; +----------+--------+------+------+------+--+ | user_id | _c1 | _c2 | _c3 | _c4 | +----------+--------+------+------+------+--+ | zhangsa | angsa | sa | zh | sa | | lisi | si | si | li | si | | wanger | nger | er | wa | er | | liiu | iu | iu | li | iu | | qibaqiu | baqiu | iu | qi | iu | | wangshi | ngshi | hi | wa | hi | | liwei | wei | ei | li | ei | | wutong | tong | ng | wu | ng | | lilisi | lisi | si | li | si | | qishili | shili | li | qi | li | +----------+--------+------+------+------+--+ ## 字符串转大写:upper,ucase 字符串转小写:lower,lcase select user_id,upper(user_id),ucase(user_id), lower(upper(user_id)),lcase(ucase(user_id)),lower(ucase(user_id)) from order_detail; +----------+----------+----------+----------+----------+----------+--+ | user_id | _c1 | _c2 | _c3 | _c4 | _c5 | +----------+----------+----------+----------+----------+----------+--+ | zhangsa | ZHANGSA | ZHANGSA | zhangsa | zhangsa | zhangsa | | lisi | LISI | LISI | lisi | lisi | lisi | | wanger | WANGER | WANGER | wanger | wanger | wanger | | liiu | LIIU | LIIU | liiu | liiu | liiu | | qibaqiu | QIBAQIU | QIBAQIU | qibaqiu | qibaqiu | qibaqiu | | wangshi | WANGSHI | WANGSHI | wangshi | wangshi | wangshi | | liwei | LIWEI | LIWEI | liwei | liwei | liwei | | wutong | WUTONG | WUTONG | wutong | wutong | wutong | | lilisi | LILISI | LILISI | lilisi | lilisi | lilisi | | qishili | QISHILI | QISHILI | qishili | qishili | qishili | +----------+----------+----------+----------+----------+----------+--+ ## 去两边的空格:trim 左边去空格:ltrim 右边去空格:rtrim select trim(' abc '),ltrim(' abc'),rtrim('abc ') from order_detail limit 1; +------+------+------+--+ | _c0 | _c1 | _c2 | +------+------+------+--+ | abc | abc | abc | +------+------+------+--+ ## 正则表达式替换: regexp_replace 说明:将字符串 A 中的符合 java 正则表达式 B 的部分替换为 C。注意,在有些情况下要使用转义字符, 类似 oracle 中的 regexp_replace 函数。 ## 正则表达式解析: regexp_extract 将字符串 subject 按照 pattern 正则表达式的规则拆分,返回 index 指定的字符。 注意,在有些情况下要使用转义字符,如等号要用双竖线转义,这是java正则表达式的规则。 select user_id,regexp_replace(user_id, 'li|ng', '**'), regexp_extract(user_id,'li(.*?)(si)',1), regexp_extract(user_id,'li(.*?)(si)',2), regexp_extract(user_id,'li(.*?)(si)',0) from order_detail; +----------+----------+------+------+---------+--+ | user_id | _c1 | _c2 | _c3 | _c4 | +----------+----------+------+------+---------+--+ | zhangsa | zha**sa | | | | | lisi | **si | | si | lisi | | wanger | wa**er | | | | | liiu | **iu | | | | | qibaqiu | qibaqiu | | | | | wangshi | wa**shi | | | | | liwei | **wei | | | | | wutong | wuto** | | | | | lilisi | ****si | li | si | lilisi | | qishili | qishi** | | | | +----------+----------+------+------+---------+--+ select regexp_extract('http://facebook.com/path1/p.php?k1=v1543643&k2=v3245#Ref1', '.*?k1\\=([^&]+)', 1), regexp_extract('http://facebook.com/path1/p.php?k1=v1543643&k2=v3245#Ref1', '.*?k2\\=([^#]+)', 1) from order_detail limit 1; +-----------+--------+--+ | _c0 | _c1 | +-----------+--------+--+ | v1543643 | v3245 | +-----------+--------+--+ ## URL解析:parse_url 语法: parse_url(string urlString, string partToExtract [, string keyToExtract]) 说明:返回 URL 中指定的部分。 partToExtract 的有效值为: HOST, PATH, QUERY, REF,PROTOCOL, AUTHORITY, FILE, and USERINFO. 举例: select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST'), parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PATH'), parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY'), parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY','k2'), parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'REF'), parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PROTOCOL'), parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'AUTHORITY'), parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'FILE') from order_detail limit 1; +---------------+---------------+--------------+------+-------+-------+---------------+---------------------------+--+ | _c0 | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 | _c7 | +---------------+---------------+--------------+------+-------+-------+---------------+---------------------------+--+ | facebook.com | /path1/p.php | k1=v1&k2=v2 | v2 | Ref1 | http | facebook.com | /path1/p.php?k1=v1&k2=v2 | +---------------+---------------+--------------+------+-------+-------+---------------+---------------------------+--+ ## json解析: get_json_object 语法: get_json_object(string json_string, string path) 说明:解析 json 的字符串 json_string,返回 path 指定的内容。如果输入的 json 字符串无效,那么返回 NULL。 select get_json_object( '{"store": {"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}], "bicycle":{"price":19.95,"color":"red"} }, "email":"amy@only_for_json_udf_test.net", "owner":"amy" }', '$.owner'), get_json_object( '{"store": {"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}], "bicycle":{"price":19.95,"color":"red"} }, "email":"amy@only_for_json_udf_test.net", "owner":"amy" }', '$.store.fruit[0].type') from order_detail limit 1; +------+--------+--+ | _c0 | _c1 | +------+--------+--+ | amy | apple | +------+--------+--+ ## json_tuple 语法: json_tuple(string jsonStr,string k1,string k2, ...) 参数为一组键k1,k2……和JSON字符串,返回值的元组。该方法比 get_json_object 高效,因为可以在一次调用中输入多个键. select a.user_id, b.* from order_detail a lateral view json_tuple('{"store": {"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}], "bicycle":{"price":19.95,"color":"red"} }, "email":"amy@only_for_json_udf_test.net", "owner":"amy" }', 'email', 'owner') b as email, owner limit 1; +----------+---------------------------------+--------+--+ | user_id | email | owner | +----------+---------------------------------+--------+--+ | zhangsa | amy@only_for_json_udf_test.net | amy | +----------+---------------------------------+--------+--+ ## parse_url_tuple SELECT b.* from ( select 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' as urlstr from order_detail limit 1 )a LATERAL VIEW parse_url_tuple(a.urlstr, 'HOST', 'PATH', 'QUERY', 'QUERY:k1') b as host, path, query, query_k1 LIMIT 1; +---------------+---------------+--------------+-----------+--+ | host | path | query | query_k1 | +---------------+---------------+--------------+-----------+--+ | facebook.com | /path1/p.php | k1=v1&k2=v2 | v1 | +---------------+---------------+--------------+-----------+--+ 空格字符串:space 重复字符串:repeat 首字符ascii:ascii select space(10), length(space(10)), repeat('abc',5), ascii('abcde') from order_detail limit 1; +-------------+------+------------------+------+--+ | _c0 | _c1 | _c2 | _c3 | +-------------+------+------------------+------+--+ | | 10 | abcabcabcabcabc | 97 | +-------------+------+------------------+------+--+ 左补足函数:lpad 右补足函数:rpad 语法: lpad(string str, int len, string pad) 说明:lpad将 str 进行用 pad 进行左补足到 len 位, rpad将 str 进行用 pad 进行右补足到 len 位 注意:与 GP,ORACLE 不同; pad不能默认 select lpad('abc',10,'td'),rpad('abc',10,'td') from order_detail limit 1; +-------------+-------------+--+ | _c0 | _c1 | +-------------+-------------+--+ | tdtdtdtabc | abctdtdtdt | +-------------+-------------+--+ 分割字符串函数: split 集合查找函数: find_in_set 语法: find_in_set(string str, string strList) 说明: 返回 str 在 strlist 第一次出现的位置, strlist 是用逗号分割的字符串。如果没有找该 str 字符,则返回 0 select split('abtcdtef','t'), find_in_set('ab','ef,ab,de'), find_in_set('at','ef,ab,de') from order_detail limit 1; +-------------------+------+------+--+ | _c0 | _c1 | _c2 | +-------------------+------+------+--+ | ["ab","cd","ef"] | 2 | 0 | +-------------------+------+------+--+ ## string转map:str_to_map 语法:str_to_map(text[, delimiter1, delimiter2]) 说明:使用两个分隔符将文本拆分为键值对。 Delimiter1将文本分成K-V对,Delimiter2分割每个K-V对。 对于delimiter1默认分隔符是',',对于delimiter2默认分隔符是':'。 select str_to_map('aaa:11&bbb:22', '&', ':') from order_detail limit 1; +--------------------------+--+ | _c0 | +--------------------------+--+ | {"bbb":"22","aaa":"11"} | +--------------------------+--+ select str_to_map('aaa:11&bbb:22', '&', ':')['aaa'] from order_detail limit 1; +------+--+ | _c0 | +------+--+ | 11 | +------+--+ select str_to_map('aaa:11,bbb:22') from person limit 1; {"bbb":"22","aaa":"11"}

    集合统计函数

    ## 个数统计:count 总和统计:sum 语法: count(*), count(expr), count(DISTINCT expr[, expr_.]) 说明: count(*)统计检索出的行的个数,包括 NULL 值的行; count(expr)返回指定字段的非空值的个数; count(DISTINCT expr[, expr_.])返回指定字段的不同的非空值的个数 语法: sum(col), sum(DISTINCT col) 说明: sum(col)统计结果集中 col 的相加的结果; sum(DISTINCT col)统计结果中 col 不同值 select count(*),count(user_type),count(distinct user_type), sum(sales),sum(distinct sales) from order_detail; +------+------+------+------+------+--+ | _c0 | _c1 | _c2 | _c3 | _c4 | +------+------+------+------+------+--+ | 10 | 10 | 2 | 29 | 17 | +------+------+------+------+------+--+ ## 平均值统计:avg 最小值统计:min 最大值统计:max select avg(sales),avg(distinct sales),min(sales),max(distinct sales) from order_detail; +------+------+------+------+--+ | _c0 | _c1 | _c2 | _c3 | +------+------+------+------+--+ | 2.9 | 3.4 | 1 | 6 | +------+------+------+------+--+ ## 标准差:stddev_samp, stddev, stddev_pop stddev_pop <==> stddev ## 方差:var_samp, var_pop 当我们需要真实的标准差/方差的时候最好是使用: stddev stddev_pop var_pop 而只是需要得到少量数据的标准差/方差的近似值可以选用: stddev_samp var_samp select var_pop(sales),var_samp(sales), stddev_pop(sales),stddev(sales),stddev_samp(sales), pow(stddev_pop(sales),2),pow(stddev_samp(sales),2) from order_detail; +-------+--------------------+---------------------+---------------------+---------------------+---------------------+---------------------+--+ | _c0 | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 | +-------+--------------------+---------------------+---------------------+---------------------+---------------------+---------------------+--+ | 3.09 | 3.433333333333333 | 1.7578395831246945 | 1.7578395831246945 | 1.8529256146249728 | 3.0899999999999994 | 3.4333333333333336 | +-------+--------------------+---------------------+---------------------+---------------------+---------------------+---------------------+--+ ## 百分位数: percentile 近似百分位数: percentile_approx 直方图: histogram_numeric 语法: percentile_approx(DOUBLE col, p [, B]) 返回值: double 说明: 求近似的第 pth 个百分位数, p 必须介于 0 和 1 之间,返回类型为 double,但是col 字段支持浮点类型。 参数 B 控制内存消耗的近似精度, B越大,结果的准确度越高。默认为 10,000。 当 col 字段中的 distinct 值的个数小于 B 时,结果为准确的百分位数 select percentile(sales,0.2), percentile_approx(sales,0.5,10000), histogram_numeric(sales,5) from order_detail; +------+------+----------------------------------------------------------------------------------------------+--+ | _c0 | _c1 | _c2 | +------+------+----------------------------------------------------------------------------------------------+--+ | 1.0 | 2.0 | [{"x":1.0,"y":3.0},{"x":2.0,"y":2.0},{"x":3.0,"y":2.0},{"x":5.0,"y":2.0},{"x":6.0,"y":1.0}] | +------+------+----------------------------------------------------------------------------------------------+--+ select inline(histogram_numeric(sales,10)) from order_detail; +------+------+--+ | x | y | +------+------+--+ | 1.0 | 3.0 | | 2.0 | 2.0 | | 3.0 | 2.0 | | 5.0 | 2.0 | | 6.0 | 1.0 | +------+------+--+ ### 后面可以输入多个百分位数,返回类型也为 array<double>,其中为对应的百分位数。 select percentile(sales,array(0.2,0.4,0.6)), percentile_approx(sales,array(0.2,0.4,0.6),10000) from order_detail; +----------------+----------------+--+ | _c0 | _c1 | +----------------+----------------+--+ | [1.0,2.0,3.0] | [1.0,1.5,2.5] | +----------------+----------------+--+

    复杂类型访问操作及统计函数

    测试数据集: tony 1338 hello,woddd 1,2 a1,a2,a3 k1:1.0,k2:2.0,k3:3.0 s1,s2,s3,4 mark 5453 kke,ladyg 2,3 a4,a5,a6 k4:4.0,k5:5.0,k2:6.0 s4,s5,s6,6 ivyfd 4323 aa,thq,dsx 3,6 a7,a8,a9 k7:7.0,k8:8.0,k2:9.0 s7,s8,s9,9 drop table employees; create external table if not exists employees( name string, salary string, happy_word string, happy_num array<int>, subordinates array<string>, deductions map<string,float>, address struct<street:string,city:string,state:string,zip:int> ) row format delimited fields terminated by '\t' collection items terminated by ',' map keys terminated by ':' lines terminated by '\n' stored as textfile; hdfs dfs -put /home/liguodong/data/muldata.txt /temp/lgd load data inpath '/temp/lgd/muldata.txt' overwrite into table employees; select * from employees; Getting log thread is interrupted, since query is done! +--------+---------+--------------+------------+-------------------+-------------------------------+---------------------------------------------------+--+ | name | salary | happy_word | happy_num | subordinates | deductions | address | +--------+---------+--------------+------------+-------------------+-------------------------------+---------------------------------------------------+--+ | tony | 1338 | hello,woddd | [1,2] | ["a1","a2","a3"] | {"k1":1.0,"k2":2.0,"k3":3.0} | {"street":"s1","city":"s2","state":"s3","zip":4} | | mark | 5453 | kke,ladyg | [2,3] | ["a4","a5","a6"] | {"k4":4.0,"k5":5.0,"k2":6.0} | {"street":"s4","city":"s5","state":"s6","zip":6} | | ivyfd | 4323 | aa,thq,dsx | [3,6] | ["a7","a8","a9"] | {"k7":7.0,"k8":8.0,"k2":9.0} | {"street":"s7","city":"s8","state":"s9","zip":9} | +--------+---------+--------------+------------+-------------------+-------------------------------+---------------------------------------------------+--+ ## 访问数组 Map 结构体 select name,salary, subordinates[1],deductions['k2'],deductions['k3'],address.city from employees; +--------+---------+------+------+-------+-------+--+ | name | salary | _c2 | _c3 | _c4 | city | +--------+---------+------+------+-------+-------+--+ | tony | 1338 | a2 | 2.0 | 3.0 | s2 | | mark | 5453 | a5 | 6.0 | NULL | s5 | | ivyfd | 4323 | a8 | 9.0 | NULL | s8 | +--------+---------+------+------+-------+-------+--+ ## Map类型长度 Array类型长度 select size(deductions),size(subordinates) from employees limit 1; +------+------+--+ | _c0 | _c1 | +------+------+--+ | 3 | 3 | +------+------+--+ ## 类型转换: cast select cast(salary as int),cast(deductions['k2'] as bigint) from employees; +---------+------+--+ | salary | _c1 | +---------+------+--+ | 1338 | 2 | | 5453 | 6 | | 4323 | 9 | +---------+------+--+ ### LATERAL VIEW 行转列 SELECT name, ad_subordinate FROM employees LATERAL VIEW explode(subordinates) addTable AS ad_subordinate; +--------+-----------------+--+ | name | ad_subordinate | +--------+-----------------+--+ | tony | a1 | | tony | a2 | | tony | a3 | | mark | a4 | | mark | a5 | | mark | a6 | | ivyfd | a7 | | ivyfd | a8 | | ivyfd | a9 | +--------+-----------------+--+ SELECT name, count(1) FROM employees LATERAL VIEW explode(subordinates) addTable AS ad_subordinate group by name; +--------+------+--+ | name | _c1 | +--------+------+--+ | ivyfd | 3 | | mark | 3 | | tony | 3 | +--------+------+--+ SELECT ad_subordinate, ad_num FROM employees LATERAL VIEW explode(subordinates) addTable AS ad_subordinate LATERAL VIEW explode(happy_num) addTable2 AS ad_num; +-----------------+---------+--+ | ad_subordinate | ad_num | +-----------------+---------+--+ | a1 | 1 | | a1 | 2 | | a2 | 1 | | a2 | 2 | | a3 | 1 | | a3 | 2 | | a4 | 2 | | a4 | 3 | | a5 | 2 | | a5 | 3 | | a6 | 2 | | a6 | 3 | | a7 | 3 | | a7 | 6 | | a8 | 3 | | a8 | 6 | | a9 | 3 | | a9 | 6 | +-----------------+---------+--+ ### 多个LATERAL VIEW SELECT name, count(1) FROM employees LATERAL VIEW explode(subordinates) addTable AS ad_subordinate LATERAL VIEW explode(happy_num) addTable2 AS ad_num group by name; +--------+------+--+ | name | _c1 | +--------+------+--+ | ivyfd | 6 | | mark | 6 | | tony | 6 | +--------+------+--+ ### 不满足条件产生空行 SELECT AA.name, BB.* FROM employees AA LATERAL VIEW explode(array()) BB AS a limit 10; +-------+----+--+ | name | a | +-------+----+--+ +-------+----+--+ ### OUTER 避免永远不产生结果,无满足条件的行,在该列会产生NULL值。 SELECT AA.name, BB.* FROM employees AA LATERAL VIEW OUTER explode(array()) BB AS a limit 10; +--------+-------+--+ | name | a | +--------+-------+--+ | tony | NULL | | mark | NULL | | ivyfd | NULL | +--------+-------+--+ ### 字符串切分成多列 SELECT name, word FROM employees LATERAL VIEW explode(split(happy_word,',')) addTable AS word; +--------+--------+--+ | name | word | +--------+--------+--+ | tony | hello | | tony | woddd | | mark | kke | | mark | ladyg | | ivyfd | aa | | ivyfd | thq | | ivyfd | dsx | +--------+--------+--+
    转载请注明原文地址: https://ju.6miu.com/read-6049.html

    最新回复(0)