接下來,我們要做的步驟有點複雜。稱為 CTAS 結構,也就是 CREAT TABLE … AS SELECT 的縮寫。之所以需要這麼做,是因為有時候必須要將多個資料表,轉換成一個龐大的表,以方便進行統計分析。
更多關於 Hive 的語法,請參考 官方網站說明 Hive Data Definition Language。
根據第一行的資料,我們可以決定資料表每個欄位的資料型態
欄位 | 範例 | 資料型態 |
---|---|---|
playerID | aardsda01 | STRING |
yearID | 2004 | INT |
stint | 1 | INT |
teamID | SFN | STRING |
lgID | NL | STRING |
W | 1 | INT |
L | 0 | INT |
G | 11 | INT |
GS | 0 | INT |
CG | 0 | INT |
SHO | 0 | INT |
SV | 0 | INT |
IPouts | 32 | INT |
H | 20 | INT |
ER | 8 | INT |
HR | 1 | INT |
BB | 10 | INT |
SO | 5 | INT |
BAOpp | 0.41 | FLOAT |
ERA | 6.75 | FLOAT |
IBB | 0 | INT |
WP | 0 | INT |
HBP | 2 | INT |
BK | 0 | INT |
BFP | 61 | INT |
GF | 5 | INT |
R | 8 | INT |
SH | NULL INT | |
SF | NULL INT | |
GIDP | NULL INT |
hive> create table baseball_userXX.Pitching ( playerID STRING, yearID INT, stint INT, teamID STRING, lgID STRING, W INT, L INT, G INT, GS INT, CG INT, SHO INT, SV INT, IPouts INT, H INT, ER INT, HR INT, BB INT, SO INT, BAOpp FLOAT, ERA FLOAT, IBB INT, WP INT, HBP INT, BK INT, BFP INT, GF INT, R INT, SH INT, SF INT, GIDP INT ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
hive> LOAD DATA LOCAL INPATH "Pitching.csv" OVERWRITE INTO TABLE baseball_userXX.Pitching;
SHOW TABLES;
來查詢目前的資料庫有哪幾個資料表。確認是否有正確產生 Batting 資料表。
hive> USE baseball; SHOW TABLES; OK Time taken: 0.025 seconds OK batting master pitching Time taken: 0.078 seconds
hive> SELECT * FROM baseball_userXX.pitching;
CREATE TABLE <資料庫名稱>.<資料表名稱> AS ( SELECT <欄位> FROM <資料表A> JOIN <資料表B> ON <條件1> JOIN <資料表C> ON <條件2>);
。
hive> USE baseball_userXX; hive> CREATE DATABASE baseball_stats_userXX; hive> CREATE TABLE baseball_stats_userXX.fact_player_stats AS SELECT A.PlayerID, B.teamID, B.AB, B.R, B.H, B.B2, B.B3, B.HR, B.RBI, P.G FROM Master A JOIN BATTING B ON A.playerID = B.playerID JOIN Pitching P ON B.playerID = P.playerID;
Job 0: Map: 3 Reduce: 1 Accumulative CPU: 15.99 sec HDFS Read: 0 HDFS Write: 0 SUCESS Total MapReduce CPU Time Spent: 15 seconds 990 msec OK Time taken: 128.734 seconds
hive> SELECT * FROM baseball_stats_userXX.fact_player_stats LIMIT 10; OK aardsda01 CHA 0 0 0 0 0 0 0 45 aardsda01 CHA 0 0 0 0 0 0 0 73 aardsda01 CHA 0 0 0 0 0 0 0 11 aardsda01 CHA 0 0 0 0 0 0 0 47 aardsda01 CHA 0 0 0 0 0 0 0 25 aardsda01 CHA 0 0 0 0 0 0 0 1 aardsda01 CHA 0 0 0 0 0 0 0 53 aardsda01 BOS 1 0 0 0 0 0 0 45 aardsda01 BOS 1 0 0 0 0 0 0 73 aardsda01 BOS 1 0 0 0 0 0 0 11 Time taken: 0.177 seconds