Skip to content
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
Original file line number Diff line number Diff line change
Expand Up @@ -2286,6 +2286,10 @@ public static List<String> getColumnNamesFromFieldSchema(List<FieldSchema> partC
return names;
}

public static List<String> getColumnTypesFromFieldSchema(List<FieldSchema> fieldSchemas) {
return fieldSchemas.stream().map(FieldSchema::getType).toList();
}

public static List<String> getInternalColumnNamesFromSignature(List<ColumnInfo> colInfos) {
List<String> names = new ArrayList<String>();
for (ColumnInfo ci : colInfos) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package org.apache.hadoop.hive.ql.parse;

import java.io.IOException;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
Expand Down Expand Up @@ -1413,11 +1414,54 @@ public String toString() {
}
}

/**
* Holds table column {@link FieldSchema} entries and lazily derived parallel name/type string
* lists for analyze / column-stats compilation.
*/
public static final class FieldSchemas implements Serializable {

private static final long serialVersionUID = 1L;

private final List<FieldSchema> schemas;

private transient List<String> colNames;
private transient List<String> colTypes;

public FieldSchemas(List<FieldSchema> schemas) {
this.schemas = schemas != null ? schemas : Collections.emptyList();
}

public List<FieldSchema> getSchemas() {
return schemas;
}

public int size() {
return schemas.size();
}

public FieldSchema get(int index) {
return schemas.get(index);
}

public List<String> getColName() {
if (colNames == null) {
colNames = Utilities.getColumnNamesFromFieldSchema(schemas);
}
return colNames;
}

public List<String> getColType() {
if (colTypes == null) {
colTypes = Utilities.getColumnTypesFromFieldSchema(schemas);
}
return colTypes;
}
}

public static class AnalyzeRewriteContext {

private String tableName;
private List<String> colName;
private List<String> colType;
private FieldSchemas fieldSchemas;
private boolean tblLvl;

public String getTableName() {
Expand All @@ -1428,12 +1472,12 @@ public void setTableName(String tableName) {
this.tableName = tableName;
}

public List<String> getColName() {
return colName;
public FieldSchemas getFieldSchemas() {
return fieldSchemas;
}

public void setColName(List<String> colName) {
this.colName = colName;
public void setFieldSchemas(FieldSchemas fieldSchemas) {
this.fieldSchemas = fieldSchemas;
}

public boolean isTblLvl() {
Expand All @@ -1444,14 +1488,6 @@ public void setTblLvl(boolean isTblLvl) {
this.tblLvl = isTblLvl;
}

public List<String> getColType() {
return colType;
}

public void setColType(List<String> colType) {
this.colType = colType;
}

}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import com.google.common.base.Preconditions;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
Expand Down Expand Up @@ -76,8 +77,7 @@
private boolean isRewritten;

private boolean isTableLevel;
private List<String> colNames;
private List<String> colType;
private FieldSchemas rewrittenColumnSchemas;
private Table tbl;

public ColumnStatsSemanticAnalyzer(QueryState queryState) throws SemanticException {
Expand All @@ -103,37 +103,36 @@
}

/**
* Get the names of the columns that support column statistics.
* Get the Field Schemas of the columns that support column statistics.
*/
private static List<String> getColumnNamesSupportingStats(Table tbl) {
List<String> colNames = new ArrayList<>();
private static FieldSchemas getStatsEligibleFieldSchemas(Table tbl) {
List<FieldSchema> result = new ArrayList<>();
for (FieldSchema col : tbl.getCols()) {
String type = col.getType();
TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(type);
boolean isSupported = ColumnStatsAutoGatherContext.isColumnSupported(typeInfo.getCategory(), () -> typeInfo);
if (isSupported) {
colNames.add(col.getName());
result.add(col);
}
}
return colNames;
return new FieldSchemas(result);
}

private List<String> getColumnName(ASTNode tree) throws SemanticException {

switch (tree.getChildCount()) {
case 2:
return getColumnNamesSupportingStats(tbl);
case 3:
int numCols = tree.getChild(2).getChildCount();
List<String> colName = new ArrayList<>(numCols);
for (int i = 0; i < numCols; i++) {
colName.add(getUnescapedName((ASTNode) tree.getChild(2).getChild(i)));
}
return colName;
default:
throw new SemanticException("Internal error. Expected number of children of ASTNode to be"
+ " either 2 or 3. Found : " + tree.getChildCount());
private List<String> getExplicitColumnNamesFromAst(ASTNode tree) throws SemanticException {
// The parser stores this statement as three pieces in order: which table (or partition) to
// analyze, a flag that this is column-level stats (not scanning the whole table for table
// stats alone), then the listed column names from "FOR COLUMNS (a, b, ...)". That layout is the reason
// we expect exactly three children and read the identifiers from the last one.
if (tree.getChildCount() != 3) {
Copy link
Copy Markdown
Contributor

@abstractdog abstractdog May 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tanishq-chugh, @thomasrebele : if (tree.getChildCount() != 3) looks like black magic to me at first sight :) , can you help the future code readers with a small code comment here regarding why only 3 is correct? I'm going to accept it, just need an explanation
The below exception message "Expected number of children of ASTNode should be 3" doesn't help at all

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a comment to explain the behaviour in commit : 2c7c593

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice, thank you very much!

throw new SemanticException("Internal error. Expected number of children of ASTNode should be 3. Found : "
+ tree.getChildCount());
}
int numCols = tree.getChild(2).getChildCount();
List<String> colName = new ArrayList<>(numCols);
for (int i = 0; i < numCols; i++) {
colName.add(getUnescapedName((ASTNode) tree.getChild(2).getChild(i)));
}
return colName;
}

private void handlePartialPartitionSpec(Map<String, String> partSpec, ColumnStatsAutoGatherContext context) throws
Expand Down Expand Up @@ -217,34 +216,33 @@
throw new RuntimeException("Unknown partition key : " + partKey);
}

protected static List<String> getColumnTypes(Table tbl, List<String> colNames) {
List<String> colTypes = new ArrayList<>();
protected static List<FieldSchema> getFieldSchemasByColName(Table tbl, List<String> colNames) {
List<FieldSchema> cols = tbl.getCols();
List<String> copyColNames = new ArrayList<>(colNames);

for (String colName : copyColNames) {
for (FieldSchema col : cols) {
if (colName.equalsIgnoreCase(col.getName())) {
String type = col.getType();
TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(type);
boolean isSupported = ColumnStatsAutoGatherContext.isColumnSupported(typeInfo.getCategory(), () -> typeInfo);
if (!isSupported) {
logTypeWarning(colName, type);
colNames.remove(colName);
} else {
colTypes.add(type);
}
Map<String, FieldSchema> colFsMap = new HashMap<>();
for (FieldSchema col : cols) {
colFsMap.put(col.getName().toLowerCase(), col);
}
List<FieldSchema> result = new ArrayList<>();
for (String colName : colNames) {
FieldSchema fs = colFsMap.get(colName.toLowerCase());
if (fs != null) {
String type = fs.getType();
TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(type);
boolean isSupported = ColumnStatsAutoGatherContext.isColumnSupported(typeInfo.getCategory(), () -> typeInfo);
if (!isSupported) {
logTypeWarning(colName, type);
} else {
result.add(new FieldSchema(colName, type, fs.getComment()));
}
}
}

return colTypes;
return result;
}

private String genRewrittenQuery(List<String> colNames, List<String> colTypes, HiveConf conf,
private String genRewrittenQuery(FieldSchemas columnSchemas, HiveConf conf,
List<TransformSpec> partTransformSpec, int specId, Map<String, String> partSpec,
boolean isPartitionStats) {
String rewritten = genRewrittenQuery(tbl, colNames, colTypes, conf, partTransformSpec, specId, partSpec,
String rewritten = genRewrittenQuery(tbl, columnSchemas, conf, partTransformSpec, specId, partSpec,
isPartitionStats, false);
isRewritten = true;
return rewritten;
Expand All @@ -257,28 +255,27 @@
protected static String genRewrittenQuery(Table tbl,
HiveConf conf, List<TransformSpec> partTransformSpec, Map<String, String> partSpec,
boolean isPartitionStats) {
List<String> colNames = getColumnNamesSupportingStats(tbl);
List<String> colTypes = ColumnStatsSemanticAnalyzer.getColumnTypes(tbl, colNames);
return ColumnStatsSemanticAnalyzer.genRewrittenQuery(
tbl, colNames, colTypes, conf, partTransformSpec, -1, partSpec, isPartitionStats, true);
return ColumnStatsSemanticAnalyzer.genRewrittenQuery(tbl, getStatsEligibleFieldSchemas(tbl), conf,
partTransformSpec, -1, partSpec, isPartitionStats, true);
}

private static String genRewrittenQuery(Table tbl, List<String> colNames, List<String> colTypes,
private static String genRewrittenQuery(Table tbl, FieldSchemas columnSchemas,

Check failure on line 262 in ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Refactor this method to reduce its Cognitive Complexity from 17 to the 15 allowed.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ4XaLlp8BBPB4tllOdK&open=AZ4XaLlp8BBPB4tllOdK&pullRequest=6443

Check warning on line 262 in ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Method has 8 parameters, which is greater than 7 authorized.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ4XaLlp8BBPB4tllOdJ&open=AZ4XaLlp8BBPB4tllOdJ&pullRequest=6443
HiveConf conf, List<TransformSpec> partTransformSpec, int specId, Map<String, String> partSpec,
boolean isPartitionStats, boolean useTableValues) {
StringBuilder rewrittenQueryBuilder = new StringBuilder("select ");

StringBuilder columnNamesBuilder = new StringBuilder();
StringBuilder columnDummyValuesBuilder = new StringBuilder();
for (int i = 0; i < colNames.size(); i++) {
for (int i = 0; i < columnSchemas.size(); i++) {
if (i > 0) {
rewrittenQueryBuilder.append(", ");
columnNamesBuilder.append(", ");
columnDummyValuesBuilder.append(", ");
}

final String columnName = unparseIdentifier(colNames.get(i), conf);
final TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(colTypes.get(i));
FieldSchema columnSchema = columnSchemas.get(i);
final String columnName = unparseIdentifier(columnSchema.getName(), conf);
final TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(columnSchema.getType());

try {
genComputeStats(rewrittenQueryBuilder, conf, i, columnName, typeInfo);
Expand Down Expand Up @@ -634,31 +631,28 @@
*/
if (shouldRewrite(ast)) {
tbl = AnalyzeCommandUtils.getTable(ast, this);
colNames = getColumnName(ast);
// Save away the original AST
originalTree = ast;
boolean isPartitionStats = AnalyzeCommandUtils.isPartitionLevelStats(ast)
|| StatsUtils.isPartitionStats(tbl, conf);

Map<Integer, List<TransformSpec>> partTransformSpecs = Collections.singletonMap(-1, null);
Map<String, String> partSpec = (isPartitionStats) ?
AnalyzeCommandUtils.getPartKeyValuePairsFromAST(tbl, ast, conf) : null;
checkForPartitionColumns(
colNames, Utilities.getColumnNamesFromFieldSchema(tbl.getPartitionKeys()));
validateSpecifiedColumnNames(colNames);

List<FieldSchema> columnSchemas = getColumnsFromAst(ast);

if (isPartitionStats) {
handlePartialPartitionSpec(partSpec, null);
if (tbl.hasNonNativePartitionSupport()) {
partTransformSpecs = tbl.getStorageHandler().getPartitionTransformSpecs(tbl);
}
}
colType = getColumnTypes(tbl, colNames);
rewrittenColumnSchemas = new FieldSchemas(columnSchemas);
isTableLevel = !isPartitionStats;

rewrittenQuery = String.join(" union all ",
Maps.transformEntries(partTransformSpecs, (specId, partTransformSpec) ->
genRewrittenQuery(colNames, colType, conf, partTransformSpec, specId, partSpec, isPartitionStats))
genRewrittenQuery(rewrittenColumnSchemas, conf, partTransformSpec, specId, partSpec, isPartitionStats))
.values());

rewrittenTree = genRewrittenTree(rewrittenQuery);
Expand All @@ -677,8 +671,7 @@
analyzeRewrite = new AnalyzeRewriteContext();
analyzeRewrite.setTableName(tbl.getFullyQualifiedName());
analyzeRewrite.setTblLvl(isTableLevel);
analyzeRewrite.setColName(colNames);
analyzeRewrite.setColType(colType);
analyzeRewrite.setFieldSchemas(rewrittenColumnSchemas);
qbp.setAnalyzeRewrite(analyzeRewrite);
origCtx.addSubContext(ctx);
initCtx(ctx);
Expand Down Expand Up @@ -709,15 +702,13 @@

tbl = AnalyzeCommandUtils.getTable(ast, this);

colNames = getColumnName(ast);
boolean isPartitionStats = AnalyzeCommandUtils.isPartitionLevelStats(ast)
|| StatsUtils.isPartitionStats(tbl, conf);

List<TransformSpec> partTransformSpec = null;
Map<String, String> partSpec = null;
checkForPartitionColumns(colNames,
Utilities.getColumnNamesFromFieldSchema(tbl.getPartitionKeys()));
validateSpecifiedColumnNames(colNames);

List<FieldSchema> columnSchemas = getColumnsFromAst(ast);

if (isPartitionStats) {
partSpec = AnalyzeCommandUtils.getPartKeyValuePairsFromAST(tbl, ast, conf);
Expand All @@ -726,33 +717,46 @@
partTransformSpec = tbl.getStorageHandler().getPartitionTransformSpec(tbl);
}
}
colType = getColumnTypes(tbl, colNames);
rewrittenColumnSchemas = new FieldSchemas(columnSchemas);
isTableLevel = !isPartitionStats;

rewrittenQuery = genRewrittenQuery(colNames, colType, conf, partTransformSpec, -1,
rewrittenQuery = genRewrittenQuery(rewrittenColumnSchemas, conf, partTransformSpec, -1,
partSpec, isPartitionStats);
rewrittenTree = genRewrittenTree(rewrittenQuery);

return rewrittenTree;
}

protected List<FieldSchema> getColumnsFromAst(ASTNode ast) throws SemanticException {
List<FieldSchema> statsEligibleFS = null;
List<String> columnNames;
if (ast.getChildCount() == 2) {
FieldSchemas eligibleFS = getStatsEligibleFieldSchemas(tbl);
statsEligibleFS = eligibleFS.getSchemas();
columnNames = eligibleFS.getColName();
} else {
columnNames = getExplicitColumnNamesFromAst(ast);
}

checkForPartitionColumns(columnNames, Utilities.getColumnNamesFromFieldSchema(tbl.getPartitionKeys()));
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tanishq-chugh , can you please check checkForPartitionColumns it also has nested for loop and validateSpecifiedColumnNames is using for loop + contains() which is also O(N*M). Both are used in getColumnsFromAst()

I haven't gone though the full PR yet, just wanted to hightlight.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Made this change in commit: 68446bd

validateSpecifiedColumnNames(columnNames);

return statsEligibleFS != null ? statsEligibleFS : getFieldSchemasByColName(tbl, columnNames);
}

AnalyzeRewriteContext getAnalyzeRewriteContext() {
AnalyzeRewriteContext analyzeRewrite = new AnalyzeRewriteContext();
analyzeRewrite.setTableName(tbl.getFullyQualifiedName());
analyzeRewrite.setTblLvl(isTableLevel);
analyzeRewrite.setColName(colNames);
analyzeRewrite.setColType(colType);
analyzeRewrite.setFieldSchemas(rewrittenColumnSchemas);
return analyzeRewrite;
}

static AnalyzeRewriteContext genAnalyzeRewriteContext(HiveConf conf, Table tbl) {
AnalyzeRewriteContext analyzeRewrite = new AnalyzeRewriteContext();
analyzeRewrite.setTableName(tbl.getFullyQualifiedName());
analyzeRewrite.setTblLvl(!(conf.getBoolVar(ConfVars.HIVE_STATS_COLLECT_PART_LEVEL_STATS) && tbl.isPartitioned()));
List<String> colNames = getColumnNamesSupportingStats(tbl);
List<String> colTypes = getColumnTypes(tbl, colNames);
analyzeRewrite.setColName(colNames);
analyzeRewrite.setColType(colTypes);
analyzeRewrite.setFieldSchemas(getStatsEligibleFieldSchemas(tbl));
return analyzeRewrite;
}

Expand Down
Loading
Loading