山西省网站建设哪里好,申请邮箱企业邮箱,全国企业信用信息公示系统辽宁,呼伦贝尔市建设网站文章目录一、创建数据库二、ODS 层（原始数据层）三、DWD 层（明细数据层）3.1 get_json_object 函数使用3.2 启动日志表 DWD层创建四、DWS 层（服务数据层）五、DWT 层（数据主题层）六、AD…
文章目录：一、创建数据库；二、ODS 层（原始数据层）；三、DWD 层（明细数据层）——3.1 get_json_object 函数使用、3.2 启动日志表 DWD 层创建；四、DWS 层（服务数据层）；五、DWT 层（数据主题层）；六、ADS 层（数据应用层）。ODS 层保持数据原貌，不做任何修改，起到备份数据的作用；数据采用 LZO 压缩，减少磁盘存储空间（100G 数据可以压缩到 10G 以内）；创建分区表，防止后续的全表扫描（在企业开发中大量使用分区表）；创建外部表——在企业开发中，除了自己用的临时表创建内部表外，绝大多数场景都是创建外部表。
一、创建数据库
[root@hadoop100 hive-3.1.2]# bin/hive
hive (default)> create database mall;
hive (default)> use mall;
二、ODS 层（原始数据层）
创建 ODS 层表通用步骤如下
① 创建启动日志表
hive (mall)> drop table if exists ods_event_log;
-- ODS raw-log table: external so dropping the table keeps the underlying
-- files; one raw JSON record per row in `line`, partitioned by day.
-- The LZO input format reads the compressed log files in place.
hive (mall)> create external table ods_event_log(line string)
partitioned by (dt string)
stored as
    inputformat 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
    outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/mall/ods/ods_event_log';
② 加载数据
hive (mall)> load data inpath '/origin_data/mall/log/topic_event/2021-01-08'
into table mall.ods_event_log partition(dt='2021-01-08');
③ 为 lzo 压缩文件创建索引
[root@hadoop100 ~]# hadoop jar /opt/module/hadoop-3.1.3/share/hadoop/common/hadoop-lzo-0.4.20.jar \
com.hadoop.compression.lzo.DistributedLzoIndexer /warehouse/mall/ods/ods_event_log/dt=2021-01-08
④ 查看是否加载成功
hive (mall) select * from ods_event_log limit 1;
{"action":"1","ar":"MX","ba":"Huawei","detail":"","en":"start","entry":"3","extend1":"",
"g":"8844J1F0@gmail.com","hw":"750*1134","l":"es","la":"-36.5","ln":"-43.3",
"loading_time":"15","md":"Huawei-2","mid":"2","nw":"WIFI","open_ad_type":"1","os":"8.2.8",
"sr":"L","sv":"V2.3.6","t":"1609368942552","uid":"2","vc":"19","vn":"1.0.1"}    2021-01-08
Time taken: 0.214 seconds, Fetched: 1 row(s)⑤ 通用加载数据脚本
#!/bin/bash
# ODS loader: loads one day's start/event logs into the ODS tables,
# then builds LZO indexes so the files are splittable for downstream jobs.
db=mall
hive=/opt/module/hive/bin/hive-3.1.2

# Default to yesterday; an explicit first argument (yyyy-mm-dd) overrides it.
do_date=$(date -d '-1 day' +%F)
if [[ -n "$1" ]]; then
    do_date=$1
fi

sql="
load data inpath '/origin_data/mall/log/topic_start/$do_date' into table ${db}.ods_start_log partition(dt='$do_date');
load data inpath '/origin_data/mall/log/topic_event/$do_date' into table ${db}.ods_event_log partition(dt='$do_date');
"
$hive -e "$sql"

hadoop jar /opt/module/hadoop-2.7.2/share/hadoop/common/hadoop-lzo-0.4.20.jar \
    com.hadoop.compression.lzo.DistributedLzoIndexer /warehouse/mall/ods/ods_start_log/dt=$do_date
hadoop jar /opt/module/hadoop-2.7.2/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer /warehouse/mall/ods/ods_event_log/dt=$do_date
Shell 中单引号和双引号的区别：单引号不取变量值；双引号取变量值；反引号执行引号中的命令；双引号内部嵌套单引号，取出变量值；单引号内部嵌套双引号，不取出变量值。三、DWD 层（明细数据层）
对用户行为数据解析；对核心数据进行判空过滤；对业务数据采用维度模型重新建模，即维度退化。
3.1 get_json_object 函数使用
数据 xjson：'[{"name":"大郎","sex":"男","age":"25"},{"name":"西门庆","sex":"男","age":"47"}]'
取出第一个 json 对象：select get_json_object('[{"name":"大郎","sex":"男","age":"25"},{"name":"西门庆","sex":"男","age":"47"}]','$[0]');
结果：{"name":"大郎","sex":"男","age":"25"}
取出第一个 json 的 age 字段的值：SELECT get_json_object('[{"name":"大郎","sex":"男","age":"25"},{"name":"西门庆","sex":"男","age":"47"}]','$[0].age');
结果：25
3.2 启动日志表 DWD层创建
① 创建启动日志表
-- DWD start-log table: one column per JSON field of the raw start log
-- (the $.key each column comes from is shown in the loading INSERT).
-- Stored as LZO-compressed Parquet, partitioned by day.
hive (mall)> CREATE EXTERNAL TABLE dwd_start_log(
    mid_id        string,  -- $.mid, device id
    user_id       string,  -- $.uid
    version_code  string,  -- $.vc
    version_name  string,  -- $.vn
    lang          string,  -- $.l
    source        string,  -- $.sr
    os            string,  -- $.os
    area          string,  -- $.ar
    model         string,  -- $.md
    brand         string,  -- $.ba
    sdk_version   string,  -- $.sv
    gmail         string,  -- $.g
    height_width  string,  -- $.hw, screen resolution
    app_time      string,  -- $.t, event timestamp
    network       string,  -- $.nw
    lng           string,  -- $.ln, longitude
    lat           string,  -- $.la, latitude
    entry         string,
    open_ad_type  string,
    action        string,
    loading_time  string,
    detail        string,
    extend1       string
)
PARTITIONED BY (dt string)
stored as parquet
location '/warehouse/mall/dwd/dwd_start_log/'
TBLPROPERTIES('parquet.compression'='lzo');
② 导入数据
-- Parse each raw JSON line of the ODS start log into the DWD columns
-- with get_json_object; one day is rewritten per run (insert overwrite).
hive (mall)> insert overwrite table dwd_start_log
PARTITION (dt='2021-01-11')
select
    get_json_object(line,'$.mid')          mid_id,
    get_json_object(line,'$.uid')          user_id,
    get_json_object(line,'$.vc')           version_code,
    get_json_object(line,'$.vn')           version_name,
    get_json_object(line,'$.l')            lang,
    get_json_object(line,'$.sr')           source,
    get_json_object(line,'$.os')           os,
    get_json_object(line,'$.ar')           area,
    get_json_object(line,'$.md')           model,
    get_json_object(line,'$.ba')           brand,
    get_json_object(line,'$.sv')           sdk_version,
    get_json_object(line,'$.g')            gmail,
    get_json_object(line,'$.hw')           height_width,
    get_json_object(line,'$.t')            app_time,
    get_json_object(line,'$.nw')           network,
    get_json_object(line,'$.ln')           lng,
    get_json_object(line,'$.la')           lat,
    get_json_object(line,'$.entry')        entry,
    get_json_object(line,'$.open_ad_type') open_ad_type,
    get_json_object(line,'$.action')       action,
    get_json_object(line,'$.loading_time') loading_time,
    get_json_object(line,'$.detail')       detail,
    get_json_object(line,'$.extend1')      extend1
from ods_start_log
where dt='2021-01-11';
③ 通用加载数据脚本
#!/bin/bash
# DWD loader: parses one day's ODS start-log JSON into the columns of
# dwd_start_log via get_json_object, overwriting that day's partition.

# 定义变量方便修改
APP=mall
hive=/opt/module/hive/bin/hive-3.1.2

# 如果是输入的日期按照取输入日期；如果没输入日期取当前时间的前一天
if [ -n "$1" ]; then
    do_date=$1
else
    do_date=$(date -d '-1 day' +%F)
fi

# NOTE: '$.' is not expanded by bash inside double quotes ($ is only special
# before a variable name), so the JSON paths reach Hive intact.
sql="
insert overwrite table $APP.dwd_start_log
PARTITION (dt='$do_date')
select
    get_json_object(line,'$.mid')          mid_id,
    get_json_object(line,'$.uid')          user_id,
    get_json_object(line,'$.vc')           version_code,
    get_json_object(line,'$.vn')           version_name,
    get_json_object(line,'$.l')            lang,
    get_json_object(line,'$.sr')           source,
    get_json_object(line,'$.os')           os,
    get_json_object(line,'$.ar')           area,
    get_json_object(line,'$.md')           model,
    get_json_object(line,'$.ba')           brand,
    get_json_object(line,'$.sv')           sdk_version,
    get_json_object(line,'$.g')            gmail,
    get_json_object(line,'$.hw')           height_width,
    get_json_object(line,'$.t')            app_time,
    get_json_object(line,'$.nw')           network,
    get_json_object(line,'$.ln')           lng,
    get_json_object(line,'$.la')           lat,
    get_json_object(line,'$.entry')        entry,
    get_json_object(line,'$.open_ad_type') open_ad_type,
    get_json_object(line,'$.action')       action,
    get_json_object(line,'$.loading_time') loading_time,
    get_json_object(line,'$.detail')       detail,
    get_json_object(line,'$.extend1')      extend1
from $APP.ods_start_log
where dt='$do_date';
"
$hive -e "$sql"
四、DWS 层（服务数据层）
五、DWT 层（数据主题层）
六、ADS 层（数据应用层）