-
Notifications
You must be signed in to change notification settings - Fork 4.8k
HIVE-29551: Avoid quadratic runtime in ColumnStatsSemanticAnalyzer#ge… #6443
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 14 commits
c41a81b
6ddb7ea
be572d3
699be1e
7941487
c903dbb
cc7cee3
06479a4
4ca13d8
a269464
afc8188
2c7c593
2b1f556
c030988
68446bd
37d1c22
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,6 +24,7 @@ | |
| import com.google.common.base.Preconditions; | ||
| import java.util.ArrayList; | ||
| import java.util.Collections; | ||
| import java.util.HashMap; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Objects; | ||
|
|
@@ -76,8 +77,7 @@ | |
| private boolean isRewritten; | ||
|
|
||
| private boolean isTableLevel; | ||
| private List<String> colNames; | ||
| private List<String> colType; | ||
| private FieldSchemas rewrittenColumnSchemas; | ||
| private Table tbl; | ||
|
|
||
| public ColumnStatsSemanticAnalyzer(QueryState queryState) throws SemanticException { | ||
|
|
@@ -103,37 +103,36 @@ | |
| } | ||
|
|
||
| /** | ||
| * Get the names of the columns that support column statistics. | ||
| * Get the Field Schemas of the columns that support column statistics. | ||
| */ | ||
| private static List<String> getColumnNamesSupportingStats(Table tbl) { | ||
| List<String> colNames = new ArrayList<>(); | ||
| private static FieldSchemas getStatsEligibleFieldSchemas(Table tbl) { | ||
| List<FieldSchema> result = new ArrayList<>(); | ||
| for (FieldSchema col : tbl.getCols()) { | ||
| String type = col.getType(); | ||
| TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(type); | ||
| boolean isSupported = ColumnStatsAutoGatherContext.isColumnSupported(typeInfo.getCategory(), () -> typeInfo); | ||
| if (isSupported) { | ||
| colNames.add(col.getName()); | ||
| result.add(col); | ||
| } | ||
| } | ||
| return colNames; | ||
| return new FieldSchemas(result); | ||
| } | ||
|
|
||
| private List<String> getColumnName(ASTNode tree) throws SemanticException { | ||
|
|
||
| switch (tree.getChildCount()) { | ||
| case 2: | ||
| return getColumnNamesSupportingStats(tbl); | ||
| case 3: | ||
| int numCols = tree.getChild(2).getChildCount(); | ||
| List<String> colName = new ArrayList<>(numCols); | ||
| for (int i = 0; i < numCols; i++) { | ||
| colName.add(getUnescapedName((ASTNode) tree.getChild(2).getChild(i))); | ||
| } | ||
| return colName; | ||
| default: | ||
| throw new SemanticException("Internal error. Expected number of children of ASTNode to be" | ||
| + " either 2 or 3. Found : " + tree.getChildCount()); | ||
| private List<String> getExplicitColumnNamesFromAst(ASTNode tree) throws SemanticException { | ||
| // The parser stores this statement as three pieces in order: which table (or partition) to | ||
| // analyze, a flag that this is column-level stats (not scanning the whole table for table | ||
| // stats alone), then the listed column names from "FOR COLUMNS (a, b, ...)". That layout is the reason | ||
| // we expect exactly three children and read the identifiers from the last one. | ||
| if (tree.getChildCount() != 3) { | ||
| throw new SemanticException("Internal error. Expected number of children of ASTNode should be 3. Found : " | ||
| + tree.getChildCount()); | ||
| } | ||
| int numCols = tree.getChild(2).getChildCount(); | ||
| List<String> colName = new ArrayList<>(numCols); | ||
| for (int i = 0; i < numCols; i++) { | ||
| colName.add(getUnescapedName((ASTNode) tree.getChild(2).getChild(i))); | ||
| } | ||
| return colName; | ||
| } | ||
|
|
||
| private void handlePartialPartitionSpec(Map<String, String> partSpec, ColumnStatsAutoGatherContext context) throws | ||
|
|
@@ -217,34 +216,33 @@ | |
| throw new RuntimeException("Unknown partition key : " + partKey); | ||
| } | ||
|
|
||
| protected static List<String> getColumnTypes(Table tbl, List<String> colNames) { | ||
| List<String> colTypes = new ArrayList<>(); | ||
| protected static List<FieldSchema> getFieldSchemasByColName(Table tbl, List<String> colNames) { | ||
| List<FieldSchema> cols = tbl.getCols(); | ||
| List<String> copyColNames = new ArrayList<>(colNames); | ||
|
|
||
| for (String colName : copyColNames) { | ||
| for (FieldSchema col : cols) { | ||
| if (colName.equalsIgnoreCase(col.getName())) { | ||
| String type = col.getType(); | ||
| TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(type); | ||
| boolean isSupported = ColumnStatsAutoGatherContext.isColumnSupported(typeInfo.getCategory(), () -> typeInfo); | ||
| if (!isSupported) { | ||
| logTypeWarning(colName, type); | ||
| colNames.remove(colName); | ||
| } else { | ||
| colTypes.add(type); | ||
| } | ||
| Map<String, FieldSchema> colFsMap = new HashMap<>(); | ||
| for (FieldSchema col : cols) { | ||
| colFsMap.put(col.getName().toLowerCase(), col); | ||
| } | ||
| List<FieldSchema> result = new ArrayList<>(); | ||
| for (String colName : colNames) { | ||
| FieldSchema fs = colFsMap.get(colName.toLowerCase()); | ||
| if (fs != null) { | ||
| String type = fs.getType(); | ||
| TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(type); | ||
| boolean isSupported = ColumnStatsAutoGatherContext.isColumnSupported(typeInfo.getCategory(), () -> typeInfo); | ||
| if (!isSupported) { | ||
| logTypeWarning(colName, type); | ||
| } else { | ||
| result.add(new FieldSchema(colName, type, fs.getComment())); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return colTypes; | ||
| return result; | ||
| } | ||
|
|
||
| private String genRewrittenQuery(List<String> colNames, List<String> colTypes, HiveConf conf, | ||
| private String genRewrittenQuery(FieldSchemas columnSchemas, HiveConf conf, | ||
| List<TransformSpec> partTransformSpec, int specId, Map<String, String> partSpec, | ||
| boolean isPartitionStats) { | ||
| String rewritten = genRewrittenQuery(tbl, colNames, colTypes, conf, partTransformSpec, specId, partSpec, | ||
| String rewritten = genRewrittenQuery(tbl, columnSchemas, conf, partTransformSpec, specId, partSpec, | ||
| isPartitionStats, false); | ||
| isRewritten = true; | ||
| return rewritten; | ||
|
|
@@ -257,28 +255,27 @@ | |
| protected static String genRewrittenQuery(Table tbl, | ||
| HiveConf conf, List<TransformSpec> partTransformSpec, Map<String, String> partSpec, | ||
| boolean isPartitionStats) { | ||
| List<String> colNames = getColumnNamesSupportingStats(tbl); | ||
| List<String> colTypes = ColumnStatsSemanticAnalyzer.getColumnTypes(tbl, colNames); | ||
| return ColumnStatsSemanticAnalyzer.genRewrittenQuery( | ||
| tbl, colNames, colTypes, conf, partTransformSpec, -1, partSpec, isPartitionStats, true); | ||
| return ColumnStatsSemanticAnalyzer.genRewrittenQuery(tbl, getStatsEligibleFieldSchemas(tbl), conf, | ||
| partTransformSpec, -1, partSpec, isPartitionStats, true); | ||
| } | ||
|
|
||
| private static String genRewrittenQuery(Table tbl, List<String> colNames, List<String> colTypes, | ||
| private static String genRewrittenQuery(Table tbl, FieldSchemas columnSchemas, | ||
|
Check failure on line 262 in ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java
|
||
| HiveConf conf, List<TransformSpec> partTransformSpec, int specId, Map<String, String> partSpec, | ||
| boolean isPartitionStats, boolean useTableValues) { | ||
| StringBuilder rewrittenQueryBuilder = new StringBuilder("select "); | ||
|
|
||
| StringBuilder columnNamesBuilder = new StringBuilder(); | ||
| StringBuilder columnDummyValuesBuilder = new StringBuilder(); | ||
| for (int i = 0; i < colNames.size(); i++) { | ||
| for (int i = 0; i < columnSchemas.size(); i++) { | ||
| if (i > 0) { | ||
| rewrittenQueryBuilder.append(", "); | ||
| columnNamesBuilder.append(", "); | ||
| columnDummyValuesBuilder.append(", "); | ||
| } | ||
|
|
||
| final String columnName = unparseIdentifier(colNames.get(i), conf); | ||
| final TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(colTypes.get(i)); | ||
| FieldSchema columnSchema = columnSchemas.get(i); | ||
| final String columnName = unparseIdentifier(columnSchema.getName(), conf); | ||
| final TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(columnSchema.getType()); | ||
|
|
||
| try { | ||
| genComputeStats(rewrittenQueryBuilder, conf, i, columnName, typeInfo); | ||
|
|
@@ -634,31 +631,28 @@ | |
| */ | ||
| if (shouldRewrite(ast)) { | ||
| tbl = AnalyzeCommandUtils.getTable(ast, this); | ||
| colNames = getColumnName(ast); | ||
| // Save away the original AST | ||
| originalTree = ast; | ||
| boolean isPartitionStats = AnalyzeCommandUtils.isPartitionLevelStats(ast) | ||
| || StatsUtils.isPartitionStats(tbl, conf); | ||
|
|
||
| Map<Integer, List<TransformSpec>> partTransformSpecs = Collections.singletonMap(-1, null); | ||
| Map<String, String> partSpec = (isPartitionStats) ? | ||
| AnalyzeCommandUtils.getPartKeyValuePairsFromAST(tbl, ast, conf) : null; | ||
| checkForPartitionColumns( | ||
| colNames, Utilities.getColumnNamesFromFieldSchema(tbl.getPartitionKeys())); | ||
| validateSpecifiedColumnNames(colNames); | ||
|
|
||
| List<FieldSchema> columnSchemas = getColumnsFromAst(ast); | ||
|
|
||
| if (isPartitionStats) { | ||
| handlePartialPartitionSpec(partSpec, null); | ||
| if (tbl.hasNonNativePartitionSupport()) { | ||
| partTransformSpecs = tbl.getStorageHandler().getPartitionTransformSpecs(tbl); | ||
| } | ||
| } | ||
| colType = getColumnTypes(tbl, colNames); | ||
| rewrittenColumnSchemas = new FieldSchemas(columnSchemas); | ||
| isTableLevel = !isPartitionStats; | ||
|
|
||
| rewrittenQuery = String.join(" union all ", | ||
| Maps.transformEntries(partTransformSpecs, (specId, partTransformSpec) -> | ||
| genRewrittenQuery(colNames, colType, conf, partTransformSpec, specId, partSpec, isPartitionStats)) | ||
| genRewrittenQuery(rewrittenColumnSchemas, conf, partTransformSpec, specId, partSpec, isPartitionStats)) | ||
| .values()); | ||
|
|
||
| rewrittenTree = genRewrittenTree(rewrittenQuery); | ||
|
|
@@ -677,8 +671,7 @@ | |
| analyzeRewrite = new AnalyzeRewriteContext(); | ||
| analyzeRewrite.setTableName(tbl.getFullyQualifiedName()); | ||
| analyzeRewrite.setTblLvl(isTableLevel); | ||
| analyzeRewrite.setColName(colNames); | ||
| analyzeRewrite.setColType(colType); | ||
| analyzeRewrite.setFieldSchemas(rewrittenColumnSchemas); | ||
| qbp.setAnalyzeRewrite(analyzeRewrite); | ||
| origCtx.addSubContext(ctx); | ||
| initCtx(ctx); | ||
|
|
@@ -709,15 +702,13 @@ | |
|
|
||
| tbl = AnalyzeCommandUtils.getTable(ast, this); | ||
|
|
||
| colNames = getColumnName(ast); | ||
| boolean isPartitionStats = AnalyzeCommandUtils.isPartitionLevelStats(ast) | ||
| || StatsUtils.isPartitionStats(tbl, conf); | ||
|
|
||
| List<TransformSpec> partTransformSpec = null; | ||
| Map<String, String> partSpec = null; | ||
| checkForPartitionColumns(colNames, | ||
| Utilities.getColumnNamesFromFieldSchema(tbl.getPartitionKeys())); | ||
| validateSpecifiedColumnNames(colNames); | ||
|
|
||
| List<FieldSchema> columnSchemas = getColumnsFromAst(ast); | ||
|
|
||
| if (isPartitionStats) { | ||
| partSpec = AnalyzeCommandUtils.getPartKeyValuePairsFromAST(tbl, ast, conf); | ||
|
|
@@ -726,33 +717,46 @@ | |
| partTransformSpec = tbl.getStorageHandler().getPartitionTransformSpec(tbl); | ||
| } | ||
| } | ||
| colType = getColumnTypes(tbl, colNames); | ||
| rewrittenColumnSchemas = new FieldSchemas(columnSchemas); | ||
| isTableLevel = !isPartitionStats; | ||
|
|
||
| rewrittenQuery = genRewrittenQuery(colNames, colType, conf, partTransformSpec, -1, | ||
| rewrittenQuery = genRewrittenQuery(rewrittenColumnSchemas, conf, partTransformSpec, -1, | ||
| partSpec, isPartitionStats); | ||
| rewrittenTree = genRewrittenTree(rewrittenQuery); | ||
|
|
||
| return rewrittenTree; | ||
| } | ||
|
|
||
| protected List<FieldSchema> getColumnsFromAst(ASTNode ast) throws SemanticException { | ||
| List<FieldSchema> statsEligibleFS = null; | ||
| List<String> columnNames; | ||
| if (ast.getChildCount() == 2) { | ||
| FieldSchemas eligibleFS = getStatsEligibleFieldSchemas(tbl); | ||
| statsEligibleFS = eligibleFS.getSchemas(); | ||
| columnNames = eligibleFS.getColName(); | ||
| } else { | ||
| columnNames = getExplicitColumnNamesFromAst(ast); | ||
| } | ||
|
|
||
| checkForPartitionColumns(columnNames, Utilities.getColumnNamesFromFieldSchema(tbl.getPartitionKeys())); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @tanishq-chugh, can you please check this? I haven't gone through the full PR yet, I just wanted to highlight it.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Made this change in commit: 68446bd |
||
| validateSpecifiedColumnNames(columnNames); | ||
|
|
||
| return statsEligibleFS != null ? statsEligibleFS : getFieldSchemasByColName(tbl, columnNames); | ||
| } | ||
|
|
||
| AnalyzeRewriteContext getAnalyzeRewriteContext() { | ||
| AnalyzeRewriteContext analyzeRewrite = new AnalyzeRewriteContext(); | ||
| analyzeRewrite.setTableName(tbl.getFullyQualifiedName()); | ||
| analyzeRewrite.setTblLvl(isTableLevel); | ||
| analyzeRewrite.setColName(colNames); | ||
| analyzeRewrite.setColType(colType); | ||
| analyzeRewrite.setFieldSchemas(rewrittenColumnSchemas); | ||
| return analyzeRewrite; | ||
| } | ||
|
|
||
| static AnalyzeRewriteContext genAnalyzeRewriteContext(HiveConf conf, Table tbl) { | ||
| AnalyzeRewriteContext analyzeRewrite = new AnalyzeRewriteContext(); | ||
| analyzeRewrite.setTableName(tbl.getFullyQualifiedName()); | ||
| analyzeRewrite.setTblLvl(!(conf.getBoolVar(ConfVars.HIVE_STATS_COLLECT_PART_LEVEL_STATS) && tbl.isPartitioned())); | ||
| List<String> colNames = getColumnNamesSupportingStats(tbl); | ||
| List<String> colTypes = getColumnTypes(tbl, colNames); | ||
| analyzeRewrite.setColName(colNames); | ||
| analyzeRewrite.setColType(colTypes); | ||
| analyzeRewrite.setFieldSchemas(getStatsEligibleFieldSchemas(tbl)); | ||
| return analyzeRewrite; | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@tanishq-chugh, @thomasrebele :
`if (tree.getChildCount() != 3)` looks like black magic to me at first sight :) Can you help future code readers with a small code comment here explaining why only 3 is correct? I'm going to accept it, I just need an explanation. The exception message below, "Expected number of children of ASTNode should be 3", doesn't help at all.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added a comment to explain the behaviour in commit: 2c7c593
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nice, thank you very much!