- 工具主要目标:解析计算脚本中的HQL语句,分析得到输入输出表、输入输出字段和相应的处理条件,进行分析展现。
- 实现思路:对AST深度优先遍历,遇到操作的token则判断当前的操作,遇到子句则压栈当前处理,处理子句。子句处理完,栈弹出。处理字句的过程中,遇到子查询就保存当前子查询的信息,判断与其父查询的关系,最终形成树形结构; 遇到字段或者条件处理则记录当前的字段和条件信息、组成Block,嵌套调用。
- 关键点解析:
1、遇到TOK_TAB或TOK_TABREF则判断出当前操作的表
2、压栈判断是否是join,判断join条件
3、定义数据结构Block,遇到在where\select\join时获得其下相应的字段和条件,组成Block
4、定义数据结构ColLine,遇到TOK_SUBQUERY保存当前的子查询信息,供父查询使用
5、定义数据结构ColLine,遇到TOK_UNION结束时,合并并截断当前的列信息
6、遇到select 或者未明确指出的字段,查询元数据进行辅助分析
7、解析结果进行相关校验
package com.xiaoju.products.parse; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.Stack; import java.util.Map.Entry; import java.util.LinkedHashSet; import org.antlr.runtime.tree.Tree; import org.apache.hadoop.hive.ql.parse.ASTNode; import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer; import org.apache.hadoop.hive.ql.parse.HiveParser; import org.apache.hadoop.hive.ql.parse.ParseDriver; import com.xiaoju.products.bean.Block; import com.xiaoju.products.bean.ColLine; import com.xiaoju.products.bean.QueryTree; import com.xiaoju.products.exception.SQLParseException; import com.xiaoju.products.exception.UnSupportedException; import com.xiaoju.products.util.Check; import com.xiaoju.products.util.MetaCache; import com.xiaoju.products.util.NumberUtil; import com.xiaoju.products.util.ParseUtil; import com.xiaoju.products.util.PropertyFileUtil; / * hive sql解析类 * * 目的:实现HQL的语句解析,分析出输入输出表、字段和相应的处理条件。为字段级别的数据血缘提供基础。 * 重点:获取SELECT操作中的表和列的相关操作。其他操作这判断到字段级别。 * 实现思路:对AST深度优先遍历,遇到操作的token则判断当前的操作,遇到子句则压栈当前处理,处理子句。子句处理完,栈弹出。 * 处理字句的过程中,遇到子查询就保存当前子查询的信息,判断与其父查询的关系,最终形成树形结构; * 遇到字段或者条件处理则记录当前的字段和条件信息、组成Block,嵌套调用。 * 关键点解析 * 1、遇到TOK_TAB或TOK_TABREF则判断出当前操作的表 * 2、压栈判断是否是join,判断join条件 * 3、定义数据结构Block,遇到在where\select\join时获得其下相应的字段和条件,组成Block * 4、定义数据结构ColLine,遇到TOK_SUBQUERY保存当前的子查询信息,供父查询使用 * 5、定义数据结构ColLine,遇到TOK_UNION结束时,合并并截断当前的列信息 * 6、遇到select * 或者未明确指出的字段,查询元数据进行辅助分析 * 7、解析结果进行相关校验 * 试用范围: * 1、支持标准SQL * 2、不支持transform using script * * @author yangyangthomas * */ public class LineParser { private static final String SPLIT_DOT = "."; private static final String SPLIT_COMMA = ","; private static final String SPLIT_AND = "&"; private static final String TOK_EOF = " "; private static final String CON_WHERE = "WHERE:"; private static final String TOK_TMP_FILE = "TOK_TMP_FILE"; private Map > dbMap = new HashMap >(); private List queryTreeList = new ArrayList (); //子查询树形关系保存 private Stack > conditionsStack = new Stack >(); private Stack > colsStack = new Stack >(); private Map > resultQueryMap = new HashMap >(); private Set conditions = new HashSet (); //where or join 条件缓存 private List cols = new ArrayList (); //一个子查询内的列缓存 private Stack tableNameStack = new Stack (); private Stack joinStack = new Stack (); private Stack joinOnStack = new Stack (); private Map queryMap = new HashMap (); private boolean joinClause = false; private ASTNode joinOn = null; private String nowQueryDB = "default"; //hive的默认库 private boolean isCreateTable = false; //结果 private List colLines = new ArrayList (); private Set outputTables = new HashSet (); private Set inputTables = new HashSet (); private List tmpColLines = new ArrayList (); private Set tmpOutputTables = new HashSet (); private Set tmpInputTables = new HashSet (); public List getColLines() { return colLines; } public Set getOutputTables() { return outputTables; } public Set getInputTables() { return inputTables; } private void parseIteral(ASTNode ast) { prepareToParseCurrentNodeAndChilds(ast); parseChildNodes(ast); parseCurrentNode(ast); endParseCurrentNode(ast); } / * 解析当前节点 * @param ast * @param set * @return */ private void parseCurrentNode(ASTNode ast){ if (ast.getToken() != null) { switch (ast.getToken().getType()) { case HiveParser.TOK_CREATETABLE: //outputtable isCreateTable = true; String tableOut = fillDB(BaseSemanticAnalyzer.getUnescapedName((ASTNode) ast.getChild(0))); tmpOutputTables.add(tableOut); MetaCache.getInstance().init(tableOut); //初始化数据,供以后使用 break; case HiveParser.TOK_TAB:// outputTable String tableTab = BaseSemanticAnalyzer.getUnescapedName((ASTNode) ast.getChild(0)); String tableOut2 = fillDB(tableTab); tmpOutputTables.add(tableOut2); MetaCache.getInstance().init(tableOut2); //初始化数据,供以后使用 break; case HiveParser.TOK_TABREF:// inputTable ASTNode tabTree = (ASTNode) ast.getChild(0); String tableInFull = fillDB((tabTree.getChildCount() == 1) ? BaseSemanticAnalyzer.getUnescapedName((ASTNode) tabTree.getChild(0)) : BaseSemanticAnalyzer.getUnescapedName((ASTNode) tabTree.getChild(0)) + SPLIT_DOT + BaseSemanticAnalyzer.getUnescapedName((ASTNode) tabTree.getChild(1)) ); String tableIn = tableInFull.substring(tableInFull.indexOf(SPLIT_DOT) + 1); tmpInputTables.add(tableInFull); MetaCache.getInstance().init(tableInFull); //初始化数据,供以后使用 queryMap.clear(); String alia = null; if (ast.getChild(1) != null) { //(TOK_TABREF (TOK_TABNAME detail usersequence_client) c) alia = ast.getChild(1).getText().toLowerCase(); QueryTree qt = new QueryTree(); qt.setCurrent(alia); qt.getTableSet().add(tableInFull); QueryTree pTree = getSubQueryParent(ast); qt.setpId(pTree.getpId()); qt.setParent(pTree.getParent()); queryTreeList.add(qt); if (joinClause && ast.getParent() == joinOn) { // TOK_SUBQUERY join TOK_TABREF ,此处的TOK_SUBQUERY信息不应该清楚 for (QueryTree entry : queryTreeList) { //当前的查询范围 if (qt.getParent().equals(entry.getParent())) { queryMap.put(entry.getCurrent(), entry); } } } else { queryMap.put(qt.getCurrent(), qt); } } else { alia = tableIn.toLowerCase(); QueryTree qt = new QueryTree(); qt.setCurrent(alia); qt.getTableSet().add(tableInFull); QueryTree pTree = getSubQueryParent(ast); qt.setpId(pTree.getpId()); qt.setParent(pTree.getParent()); queryTreeList.add(qt); if (joinClause && ast.getParent() == joinOn) { for (QueryTree entry : queryTreeList) { if (qt.getParent().equals(entry.getParent())) { queryMap.put(entry.getCurrent(), entry); } } } else { queryMap.put(qt.getCurrent(), qt); //此处检查查询 select app.t1.c1,t1.c1 from t1 的情况 queryMap.put(tableInFull.toLowerCase(), qt); } } break; case HiveParser.TOK_SUBQUERY: if (ast.getChildCount() == 2) { String tableAlias = BaseSemanticAnalyzer.unescapeIdentifier(ast.getChild(1).getText()); String aliaReal = ""; if(aliaReal.length() !=0){ aliaReal = aliaReal.substring(0, aliaReal.length()-1); } QueryTree qt = new QueryTree(); qt.setCurrent(tableAlias.toLowerCase()); qt.setColLineList(generateColLineList(cols, conditions)); QueryTree pTree = getSubQueryParent(ast); qt.setId(generateTreeId(ast)); qt.setpId(pTree.getpId()); qt.setParent(pTree.getParent()); qt.setChildList(getSubQueryChilds(qt.getId())); if (Check.notEmpty(qt.getChildList())) { for (QueryTree cqt : qt.getChildList()) { qt.getTableSet().addAll(cqt.getTableSet()); queryTreeList.remove(cqt); // 移除子节点信息 } } queryTreeList.add(qt); cols.clear(); queryMap.clear(); for (QueryTree _qt : queryTreeList) { if (qt.getParent().equals( _qt.getParent())) { //当前子查询才保存 queryMap.put(_qt.getCurrent(), _qt); } } } break; case HiveParser.TOK_SELEXPR: //输入输出字段的处理 / * (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) * (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) * * (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) * (TOK_SELECT * (TOK_SELEXPR (. (TOK_TABLE_OR_COL p) datekey) datekey) * (TOK_SELEXPR (TOK_TABLE_OR_COL datekey)) * (TOK_SELEXPR (TOK_FUNCTIONDI count (. (TOK_TABLE_OR_COL base) userid)) buyer_count)) * (TOK_SELEXPR (TOK_FUNCTION when (> (. (TOK_TABLE_OR_COL base) userid) 5) (. (TOK_TABLE_OR_COL base) clienttype) (> (. (TOK_TABLE_OR_COL base) userid) 1) (+ (. (TOK_TABLE_OR_COL base) datekey) 5) (+ (. (TOK_TABLE_OR_COL base) clienttype) 1)) bbbaaa) */ //解析需要插入的表 Tree tok_insert = ast.getParent().getParent(); Tree child = tok_insert.getChild(0).getChild(0); String tName = BaseSemanticAnalyzer.getUnescapedName((ASTNode) child.getChild(0)); String destTable = TOK_TMP_FILE.equals(tName) ? TOK_TMP_FILE : fillDB(tName); //select a.*,* from t1 和 select * from (select c1 as a,c2 from t1) t 的情况 if (ast.getChild(0).getType() == HiveParser.TOK_ALLCOLREF) { String tableOrAlias = ""; if (ast.getChild(0).getChild(0) != null) { tableOrAlias = ast.getChild(0).getChild(0).getChild(0).getText(); } String[] result = getTableAndAlia(tableOrAlias); String _alia = result[1]; boolean isSub = false; //处理嵌套select * 的情况 if (Check.notEmpty(_alia)) { for (String string : _alia.split(SPLIT_AND)) { //迭代循环的时候查询 QueryTree qt = queryMap.get(string.toLowerCase()); if (null != qt) { List colLineList = qt.getColLineList(); if (Check.notEmpty(colLineList)) { isSub = true; for (ColLine colLine : colLineList) { cols.add(colLine); } } } } } if (!isSub) { //处理直接select * 的情况 String nowTable = result[0]; String[] tableArr = nowTable.split(SPLIT_AND); //fact.test&test2 for (String tables : tableArr) { String[] split = tables.split("\\."); if (split.length > 2) { throw new SQLParseException("parse table:" + nowTable); } List colByTab = MetaCache.getInstance().getColumnByDBAndTable(tables); for (String column : colByTab) { Set fromNameSet = new LinkedHashSet (); fromNameSet.add(tables + SPLIT_DOT + column); ColLine cl = new ColLine(column, tables + SPLIT_DOT + column, fromNameSet, new LinkedHashSet () , destTable, column); cols.add(cl); } } } } else { Block bk = getBlockIteral((ASTNode)ast.getChild(0)); String toNameParse = getToNameParse(ast, bk); Set fromNameSet = filterData(bk.getColSet()); ColLine cl = new ColLine(toNameParse, bk.getCondition(), fromNameSet, new LinkedHashSet () , destTable, ""); cols.add(cl); } break; case HiveParser.TOK_WHERE: //3、过滤条件的处理select类 conditions.add(CON_WHERE + getBlockIteral((ASTNode) ast.getChild(0)).getCondition()); break; default: / * (or * (> (. (TOK_TABLE_OR_COL p) orderid) (. (TOK_TABLE_OR_COL c) orderid)) * (and (= (. (TOK_TABLE_OR_COL p) a) (. (TOK_TABLE_OR_COL c) b)) * (= (. (TOK_TABLE_OR_COL p) aaa) (. (TOK_TABLE_OR_COL c) bbb)))) */ //1、过滤条件的处理join类 if (joinOn != null && joinOn.getTokenStartIndex() == ast.getTokenStartIndex() && joinOn.getTokenStopIndex() == ast.getTokenStopIndex()) { ASTNode astCon = (ASTNode)ast.getChild(2); conditions.add(ast.getText().substring(4) + ":" + getBlockIteral(astCon).getCondition()); break; } } } } / * 查找当前节点的父子查询节点 * @param ast */ private QueryTree getSubQueryParent(Tree ast) { Tree _tree = ast; QueryTree qt = new QueryTree(); while(!(_tree = _tree.getParent()).isNil()){ if(_tree.getType() == HiveParser.TOK_SUBQUERY){ qt.setpId(generateTreeId(_tree)); qt.setParent(BaseSemanticAnalyzer.getUnescapedName((ASTNode)_tree.getChild(1))); return qt; } } qt.setpId(-1); qt.setParent("NIL"); return qt; } private int generateTreeId(Tree tree) { return tree.getTokenStartIndex() + tree.getTokenStopIndex(); } / * 查找当前节点的子子查询节点(索引) * @param ast */ private List getSubQueryChilds(int id) { List list = new ArrayList (); for (int i = 0; i < queryTreeList.size(); i++) { QueryTree qt = queryTreeList.get(i); if (id == qt.getpId()) { list.add(qt); } } return list; } / * 获得要解析的名称 * @param ast * @param bk * @return */ private String getToNameParse(ASTNode ast, Block bk) { String alia = ""; Tree child = ast.getChild(0); if (ast.getChild(1) != null) { //有别名 ip as alia alia = ast.getChild(1).getText(); } else if (child.getType() == HiveParser.DOT //没有别名 a.ip && child.getChild(0).getType() == HiveParser.TOK_TABLE_OR_COL && child.getChild(0).getChildCount() == 1 && child.getChild(1).getType() == HiveParser.Identifier) { alia = BaseSemanticAnalyzer.unescapeIdentifier(child.getChild(1).getText()); } else if (child.getType() == HiveParser.TOK_TABLE_OR_COL //没有别名 ip && child.getChildCount() == 1 && child.getChild(0).getType() == HiveParser.Identifier) { alia = BaseSemanticAnalyzer.unescapeIdentifier(child.getChild(0).getText()); } return alia; } / * 获得解析的块,主要应用在WHERE、JOIN和SELECT端 * 如: where a=1 *
t1 join t2 on t1.col1=t2.col1 and t1.col2=123 *
select count(distinct col1) from t1 * @param ast * @return */ private Block getBlockIteral(ASTNode ast) { if (ast.getType() == HiveParser.KW_OR ||ast.getType() == HiveParser.KW_AND) { Block bk1 = getBlockIteral((ASTNode)ast.getChild(0)); Block bk2 = getBlockIteral((ASTNode)ast.getChild(1)); bk1.getColSet().addAll(bk2.getColSet()); bk1.setCondition("(" + bk1.getCondition() + " " + ast.getText() + " " + bk2.getCondition() + ")"); return bk1; } else if (ast.getType() == HiveParser.NOTEQUAL //判断条件 > < like in || ast.getType() == HiveParser.EQUAL || ast.getType() == HiveParser.LESSTHAN || ast.getType() == HiveParser.LESSTHANOREQUALTO || ast.getType() == HiveParser.GREATERTHAN || ast.getType() == HiveParser.GREATERTHANOREQUALTO || ast.getType() == HiveParser.KW_LIKE || ast.getType() == HiveParser.DIVIDE || ast.getType() == HiveParser.PLUS || ast.getType() == HiveParser.MINUS || ast.getType() == HiveParser.STAR || ast.getType() == HiveParser.MOD || ast.getType() == HiveParser.AMPERSAND || ast.getType() == HiveParser.TILDE || ast.getType() == HiveParser.BITWISEOR || ast.getType() == HiveParser.BITWISEXOR) { Block bk1 = getBlockIteral((ASTNode)ast.getChild(0)); if (ast.getChild(1) == null) { // -1 bk1.setCondition(ast.getText() + bk1.getCondition()); } else { Block bk2 = getBlockIteral((ASTNode)ast.getChild(1)); bk1.getColSet().addAll(bk2.getColSet()); bk1.setCondition(bk1.getCondition() + " " + ast.getText() + " " + bk2.getCondition()); } return bk1; } else if (ast.getType() == HiveParser.TOK_FUNCTIONDI) { Block col = getBlockIteral((ASTNode) ast.getChild(1)); String condition = ast.getChild(0).getText(); col.setCondition(condition + "(distinct (" + col.getCondition() +"))"); return col; } else if (ast.getType() == HiveParser.TOK_FUNCTION){ String fun = ast.getChild(0).getText(); Block col = ast.getChild(1) == null ? new Block() : getBlockIteral((ASTNode) ast.getChild(1)); if ("when".equalsIgnoreCase(fun)) { col.setCondition(getWhenCondition(ast)); Set processChilds = processChilds(ast, 1); col.getColSet().addAll(bkToCols(col, processChilds)); return col; } else if("IN".equalsIgnoreCase(fun)) { col.setCondition(col.getCondition() + " in (" + blockCondToString(processChilds(ast, 2)) + ")"); return col; } else if("TOK_ISNOTNULL".equalsIgnoreCase(fun) //isnull isnotnull || "TOK_ISNULL".equalsIgnoreCase(fun)){ col.setCondition(col.getCondition() + " " + fun.toLowerCase().substring(4)); return col; } else if("BETWEEN".equalsIgnoreCase(fun)){ col.setCondition(getBlockIteral((ASTNode) ast.getChild(2)).getCondition() + " between " + getBlockIteral((ASTNode) ast.getChild(3)).getCondition() + " and " + getBlockIteral((ASTNode) ast.getChild(4)).getCondition()); return col; } Set processChilds = processChilds(ast, 1); col.getColSet().addAll(bkToCols(col, processChilds)); col.setCondition(fun +"("+ blockCondToString(processChilds) + ")"); return col; } else if(ast.getType() == HiveParser.LSQUARE){ //map,array Block column = getBlockIteral((ASTNode) ast.getChild(0)); Block key = getBlockIteral((ASTNode) ast.getChild(1)); column.setCondition(column.getCondition() +"["+ key.getCondition() + "]"); return column; } else { return parseBlock(ast); } } private Set bkToCols(Block col, Set processChilds) { Set set = new LinkedHashSet (processChilds.size()); for (Block colLine : processChilds) { if (Check.notEmpty(colLine.getColSet())) { set.addAll(colLine.getColSet()); } } return set; } private String blockCondToString(Set processChilds) { StringBuilder sb = new StringBuilder(); for (Block colLine : processChilds) { sb.append(colLine.getCondition()).append(SPLIT_COMMA); } if (sb.length()>0) { sb.setLength(sb.length()-1); } return sb.toString(); } / * 解析when条件 * @param ast * @return case when c1>100 then col1 when c1>0 col2 else col3 end */ private String getWhenCondition(ASTNode ast) { int cnt = ast.getChildCount(); StringBuilder sb = new StringBuilder(); for (int i = 1; i < cnt; i++) { String condition = getBlockIteral((ASTNode)ast.getChild(i)).getCondition(); if (i == 1) { sb.append("(case when " + condition); } else if (i == cnt-1) { //else sb.append(" else " + condition + " end)"); } else if (i % 2 == 0){ //then sb.append(" then " + condition); } else { sb.append(" when " + condition); } } return sb.toString(); } / * 保存subQuery查询别名和字段信息 * @param sqlIndex * @param tableAlias */ private void putResultQueryMap(int sqlIndex, String tableAlias) { List list = generateColLineList(cols, conditions); String key = sqlIndex == 0 ? tableAlias : tableAlias + sqlIndex; //没有重名的情况就不用标记 resultQueryMap.put(key, list); } private List generateColLineList(List cols, Set conditions) { List list = new ArrayList (); for (ColLine entry : cols) { entry.getConditionSet().addAll(conditions); list.add(ParseUtil.cloneColLine(entry)); } return list; } / * 判断正常列, * 正常:a as col, a * 异常:1 ,'a' //数字、字符等作为列名 */ private boolean notNormalCol(String column) { return Check.isEmpty(column) || NumberUtil.isNumeric(column) || (column.startsWith("\"") && column.endsWith("\"")) || (column.startsWith("\'") && column.endsWith("\'")); } / * 从指定索引位置开始解析子树 * @param ast * @param startIndex 开始索引 * @param isSimple 是否简写 * @param withCond 是否包含条件 * @return */ private Set processChilds(ASTNode ast,int startIndex) { int cnt = ast.getChildCount(); Set set = new LinkedHashSet (); for (int i = startIndex; i < cnt; i++) { Block bk = getBlockIteral((ASTNode) ast.getChild(i)); if (Check.notEmpty(bk.getCondition()) || Check.notEmpty(bk.getColSet())){ set.add(bk); } } return set; } / * 解析获得列名或者字符数字等和条件 * @param ast * @param isSimple * @return */ private Block parseBlock(ASTNode ast) { if (ast.getType() == HiveParser.DOT && ast.getChild(0).getType() == HiveParser.TOK_TABLE_OR_COL && ast.getChild(0).getChildCount() == 1 && ast.getChild(1).getType() == HiveParser.Identifier) { String column = BaseSemanticAnalyzer.unescapeIdentifier(ast.getChild(1).getText()); String alia = BaseSemanticAnalyzer.unescapeIdentifier(ast.getChild(0).getChild(0).getText()); return getBlock(column, alia); } else if (ast.getType() == HiveParser.TOK_TABLE_OR_COL && ast.getChildCount() == 1 && ast.getChild(0).getType() == HiveParser.Identifier) { String column = ast.getChild(0).getText(); return getBlock(column, null); } else if (ast.getType() == HiveParser.Number || ast.getType() == HiveParser.StringLiteral || ast.getType() == HiveParser.Identifier) { Block bk = new Block(); bk.setCondition(ast.getText()); bk.getColSet().add(ast.getText()); return bk; } return new Block(); } / * 根据列名和别名获得块信息 * @param column * @param alia * @param tree 当前子查询下的别名可以使用 * @return */ private Block getBlock(String column, String alia) { String[] result = getTableAndAlia(alia); String tableArray = result[0]; String _alia = result[1]; for (String string : _alia.split(SPLIT_AND)) { //迭代循环的时候查询 QueryTree qt = queryMap.get(string.toLowerCase()); if (Check.notEmpty(column)) { for (ColLine colLine : qt.getColLineList()) { if (column.equalsIgnoreCase(colLine.getToNameParse())) { Block bk = new Block(); bk.setCondition(colLine.getColCondition()); bk.setColSet(ParseUtil.cloneSet(colLine.getFromNameSet())); return bk; } } } } String _realTable = tableArray; int cnt = 0; //匹配字段和元数据字段相同数目,如果有多个匹配,即此sql有二义性 for (String tables : tableArray.split(SPLIT_AND)) { //初始化的时候查询数据库对应表 String[] split = tables.split("\\."); if (split.length > 2) { throw new SQLParseException("parse table:" + tables); } List colByTab = MetaCache.getInstance().getColumnByDBAndTable(tables); for (String col : colByTab) { if (column.equalsIgnoreCase(col)) { _realTable = tables; cnt++; } } } // if (cnt == 0) { //此类没有找到的检查在Validater类中检查 // } if (cnt > 1) { //二义性检查 throw new SQLParseException("SQL is ambiguity, column: " + column + " tables:" + tableArray); } Block bk = new Block(); bk.setCondition(_realTable + SPLIT_DOT + column); bk.getColSet().add(_realTable + SPLIT_DOT + column); return bk; } / * 过滤掉无用的列:如col1,123,'2013',col2 ==>> col1,col2 * @param col * @return */ private Set filterData(Set colSet){ Set set = new LinkedHashSet (); for (String string : colSet) { if (!notNormalCol(string)) { set.add(string); } } return set; } / * 解析所有子节点 * @param ast * @return */ private void parseChildNodes(ASTNode ast){ int numCh = ast.getChildCount(); if (numCh > 0) { for (int num = 0; num < numCh; num++) { ASTNode child = (ASTNode) ast.getChild(num); parseIteral(child); } } } / * 准备解析当前节点 * @param ast */ private void prepareToParseCurrentNodeAndChilds(ASTNode ast){ if (ast.getToken() != null) { switch (ast.getToken().getType()) { case HiveParser.TOK_SWITCHDATABASE: System.out.println("nowQueryDB changed " + nowQueryDB+ " to " +ast.getChild(0).getText()); nowQueryDB = ast.getChild(0).getText(); break; case HiveParser.TOK_TRANSFORM: throw new UnSupportedException("no support transform using clause"); case HiveParser.TOK_RIGHTOUTERJOIN: case HiveParser.TOK_LEFTOUTERJOIN: case HiveParser.TOK_JOIN: case HiveParser.TOK_LEFTSEMIJOIN: case HiveParser.TOK_MAPJOIN: case HiveParser.TOK_FULLOUTERJOIN: case HiveParser.TOK_UNIQUEJOIN: joinStack.push(joinClause); joinClause = true; joinOnStack.push(joinOn); joinOn = ast; break; } } } / * 结束解析当前节点 * @param ast */ private void endParseCurrentNode(ASTNode ast){ if (ast.getToken() != null) { Tree parent = ast.getParent(); switch (ast.getToken().getType()) { //join 从句结束,跳出join case HiveParser.TOK_RIGHTOUTERJOIN: case HiveParser.TOK_LEFTOUTERJOIN: case HiveParser.TOK_JOIN: case HiveParser.TOK_LEFTSEMIJOIN: case HiveParser.TOK_MAPJOIN: case HiveParser.TOK_FULLOUTERJOIN: case HiveParser.TOK_UNIQUEJOIN: joinClause = joinStack.pop(); joinOn = joinOnStack.pop(); break; case HiveParser.TOK_QUERY: processUnionStack(ast, parent); //union的子节点 case HiveParser.TOK_INSERT: case HiveParser.TOK_SELECT: break; case HiveParser.TOK_UNION: //合并union字段信息 mergeUnionCols(); processUnionStack(ast, parent); //union的子节点 break; } } } private void mergeUnionCols() { validateUnion(cols); int size = cols.size(); int colNum = size / 2; List list = new ArrayList (colNum); for (int i = 0; i < colNum; i++) { //合并字段 ColLine col = cols.get(i); for (int j = i + colNum; j < size; j = j + colNum) { ColLine col2 = cols.get(j); list.add(col2); if (notNormalCol(col.getToNameParse()) && !notNormalCol(col2.getToNameParse())) { col.setToNameParse(col2.getToNameParse()); } col.getFromNameSet().addAll(col2.getFromNameSet()); col.setColCondition(col.getColCondition() + SPLIT_AND + col2.getColCondition()); Set conditionSet = ParseUtil.cloneSet(col.getConditionSet()); conditionSet.addAll(col2.getConditionSet()); conditionSet.addAll(conditions); col.getConditionSet().addAll(conditionSet); } } cols.removeAll(list); //移除已经合并的数据 } private void processUnionStack(ASTNode ast, Tree parent) { boolean isNeedAdd = parent.getType() == HiveParser.TOK_UNION; if (isNeedAdd) { if (parent.getChild(0) == ast && parent.getChild(1) != null) {//有弟节点(是第一节点) //压栈 conditionsStack.push(ParseUtil.cloneSet(conditions)); conditions.clear(); colsStack.push(ParseUtil.cloneList(cols)); cols.clear(); } else { //无弟节点(是第二节点) //出栈 if (!conditionsStack.isEmpty()) { conditions.addAll(conditionsStack.pop()); } if (!colsStack.isEmpty()) { cols.addAll(0, colsStack.pop()); } } } } private void parseAST(ASTNode ast) { parseIteral(ast); } public void parse(String sqlAll) throws Exception{ if (Check.isEmpty(sqlAll)) { return; } startParseAll(); //清空最终结果集 int i = 0; //当前是第几个sql for (String sql : sqlAll.split("(? > map = new HashMap >(); for (Entry > entry : resultQueryMap.entrySet()) { if (entry.getKey().startsWith(TOK_EOF)) { List value = entry.getValue(); for (ColLine colLine : value) { List list = map.get(colLine.getToTable()); if (Check.isEmpty(list)) { list = new ArrayList (); map.put(colLine.getToTable(), list); } list.add(colLine); } } } for (Entry > entry : map.entrySet()) { String table = entry.getKey(); List pList = entry.getValue(); List dList = dbMap.get(table); int metaSize = Check.isEmpty(dList) ? 0 : dList.size(); for (int i = 0; i < pList.size(); i++) { //按顺序插入对应的字段 ColLine clp = pList.get(i); String colName = null; if (i < metaSize) { colName = table + SPLIT_DOT + dList.get(i); } if (isCreateTable && TOK_TMP_FILE.equals(table)) { for (String string : tmpOutputTables) { table = string; } } ColLine colLine = new ColLine(clp.getToNameParse(), clp.getColCondition(), clp.getFromNameSet(), clp.getConditionSet(), table, colName); colLines.add(colLine); } } } /* * 设置输出表的字段对应关系 */ private void setOutInputTableSet() { outputTables.addAll(ParseUtil.cloneSet(tmpOutputTables)); inputTables.addAll(ParseUtil.cloneSet(tmpInputTables)); } private void putDBMap() { for (String table : tmpOutputTables) { List list = MetaCache.getInstance().getColumnByDBAndTable(table); dbMap.put(table, list); } } / * 补全db信息 * table1 ==>> db1.table1 * db1.table1 ==>> db1.table1 * db2.t1&t2 ==>> db2.t1&db1.t2 * @param tables */ private String fillDB(String nowTable) { if (Check.isEmpty(nowTable)) { return nowTable; } StringBuilder sb = new StringBuilder(); String[] tableArr = nowTable.split(SPLIT_AND); //fact.test&test2&test3 for (String tables : tableArr) { String[] split = tables.split("\\" + SPLIT_DOT); if (split.length > 2) { System.out.println(tables); throw new SQLParseException("parse table:" + nowTable); } String db = split.length == 2 ? split[0] : nowQueryDB ; String table = split.length == 2 ? split[1] : split[0] ; sb.append(db).append(SPLIT_DOT).append(table).append(SPLIT_AND); } if (sb.length()>0) { sb.setLength(sb.length()-1); } return sb.toString(); } / * 根据别名查询表明 * @param alia * @return */ private String[] getTableAndAlia(String alia) { String _alia = Check.notEmpty(alia) ? alia : ParseUtil.collectionToString(queryMap.keySet(), SPLIT_AND, true) ; String[] result = {"" , _alia}; Set tableSet = new HashSet (); if (Check.notEmpty(_alia)) { String[] split = _alia.split(SPLIT_AND); for (String string : split) { //别名又分单独起的别名 和 表名,即 select a.col,table_name.col from table_name a if (tmpInputTables.contains(string) || tmpInputTables.contains(fillDB(string))) { tableSet.add(fillDB(string)); } else if (queryMap.containsKey(string.toLowerCase())) { tableSet.addAll(queryMap.get(string.toLowerCase()).getTableSet()); } } result[0] = ParseUtil.collectionToString(tableSet, SPLIT_AND, true); result[1] = _alia; } return result; } / * 校验union * @param list */ private void validateUnion(List list){ int size = list.size(); if (size % 2 == 1) { throw new SQLParseException("union column number are different, size=" + size); } int colNum = size / 2; checkUnion(list, 0, colNum); checkUnion(list, colNum, size); } private void checkUnion(List list, int start, int end) { String tmp = null; for (int i = start; i < end; i++) { //合并字段 ColLine col = list.get(i); if (Check.isEmpty(tmp)) { tmp = col.getToTable(); } else if (!tmp.equals(col.getToTable())){ throw new SQLParseException("union column number/types are different,table1=" + tmp +",table2="+ col.getToTable()); } } } }
另,附上一AST语法树格式化的python脚本。
#!/usr/bin/env python # -*- coding: utf-8 -*- ''' Created on 2012-5-20 ''' import sys # explain select key from kv mykv join test mytest on (mykv.key == mytest.id); original_str = """ (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME dim_city))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL city_name)) (TOK_SELEXPR (TOK_TABLE_OR_COL city_id)) (TOK_SELEXPR (TOK_TABLE_OR_COL pt))) (TOK_WHERE (AND (= (TOK_TABLE_OR_COL pt) '$yesday') (= (TOK_TABLE_OR_COL level) 2))) (TOK_GROUPBY (TOK_TABLE_OR_COL city_name) (TOK_TABLE_OR_COL city_id) (TOK_TABLE_OR_COL pt)))) b) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME dw_dri_wide_sheet))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL city_id)) (TOK_SELEXPR (TOK_TABLE_OR_COL pt)) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION when (= (TOK_FUNCTION to_date (TOK_TABLE_OR_COL last_sucgrabord_time)) '$data_desc') (TOK_TABLE_OR_COL dri_id))) last1_dri_cnt) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_FUNCTION when (and (> (TOK_FUNCTION to_date (TOK_TABLE_OR_COL last_sucgrabord_time)) (TOK_FUNCTION date_sub '$data_desc' 7)) (<= (TOK_FUNCTION to_date (TOK_TABLE_OR_COL last_sucgrabord_time)) '$data_desc')) (TOK_TABLE_OR_COL dri_id))) last7_dri_cnt)) (TOK_WHERE (and (= (TOK_TABLE_OR_COL pt) '$data_desc') (TOK_FUNCTION TOK_ISNOTNULL (TOK_TABLE_OR_COL last_sucgrabord_time)))) (TOK_GROUPBY (TOK_TABLE_OR_COL city_id) (TOK_TABLE_OR_COL pt)))) a) (= (. (TOK_TABLE_OR_COL a) city_id) (. (TOK_TABLE_OR_COL b) city_id)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test kd_st_kpi_dri_active_day_city_bi))) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) city_name)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) city_id)) (TOK_SELEXPR (TOK_FUNCTION nvl (TOK_TABLE_OR_COL last1_dri_cnt) 0)) (TOK_SELEXPR (TOK_FUNCTION nvl (TOK_TABLE_OR_COL last7_dri_cnt) 0)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) pt)))))
""" tmp_str = original_str.strip().replace('\n', '') def my_print(mystr): sys.stdout.write(mystr) def print_indent(indent_level): for i in range(indent_level): my_print(' ' * 4) indent_level = 0 for char in tmp_str: if char == '(': # 如果是左括号,先换行,然后打印缩进+( my_print('\n') print_indent(indent_level) my_print(char) indent_level += 1 elif char == ')': # 如果是右括号,先打印),再换行,打印下一级别的缩进 indent_level -= 1 my_print(char) my_print('\n') print_indent(indent_level - 1) else: # 其他的直接打印出来 my_print(char)
发布者:全栈程序员-站长,转载请注明出处:https://javaforall.net/221931.html原文链接:https://javaforall.net
