create table if not exists dumdum (val map<string,map<string,struct<student_id:string,age:int>>>);
insert into dumdum select map('A',map('1',named_struct('student_id','123a', 'age',11)));
insert into dumdum select map('B',map('2',named_struct('student_id','987z', 'age',11)));
select * from dumdum;
{"A":{"1":{"student_id":"123a","age":11}}}
{"B":{"2":{"student_id":"987z","age":11}}}
select some_udf(val) from dumdum;
["123a","987z"]
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.*;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
public class CustomUDF extends GenericUDF {
private MapObjectInspector inputMapOI = null;
private Converter inputMapKeyConverter = null;
private MapObjectInspector inputMapValueMapOI = null;
private Converter inputMapValueConverter;
@Override
public String getDisplayString(String[] arguments) {
return "my udf";
}
@Override
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
if ((null == arguments) || (arguments.length != 1)) {
throw new UDFArgumentLengthException("1 arguments are expected.");
}
if (!(arguments[0] instanceof MapObjectInspector)) {
throw new UDFArgumentException("The first parameter should be a map object ");
}
inputMapOI = (MapObjectInspector) arguments[0];
ObjectInspector mapKeyOI = PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING);
inputMapKeyConverter = ObjectInspectorConverters.getConverter(this.inputMapOI.getMapKeyObjectInspector(), mapKeyOI);
if (!(inputMapOI.getMapValueObjectInspector() instanceof MapObjectInspector)) {
throw new UDFArgumentException("The map value type must be a map ");
}
inputMapValueMapOI = (MapObjectInspector) this.inputMapOI.getMapValueObjectInspector();
List<String> structFieldNames = new ArrayList<String>();
structFieldNames.add("student_id");
structFieldNames.add("age");
List<ObjectInspector> structFieldObjectInspectors = new ArrayList<ObjectInspector>();
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector);
ObjectInspector inputMapElementOI = inputMapValueMapOI.getMapValueObjectInspector();
ObjectInspector outputMapElementOI = ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames, structFieldObjectInspectors);
inputMapValueConverter = ObjectInspectorConverters.getConverter(inputMapElementOI, outputMapElementOI);
return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
}
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
if ((null == arguments) || (arguments.length != 1)) {
throw new UDFArgumentLengthException("1 argument is expected.");
}
Map<?, ?> map = inputMapOI.getMap(arguments[0].get());
List<String> dataList = new ArrayList<String>();
for (Object key : map.keySet()) {
Map<?, ?> valueMap = this.inputMapValueMapOI.getMap(map.get(key));
if ((valueMap == null) || (valueMap.size() == 0)) {
continue;
}
for (Object value : valueMap.keySet()) {
try{
String innerkey = (String) this.inputMapKeyConverter.convert(value);
System.out.println("Got "+innerKey);
Map<?, ?> innerMap = (Map<?, ?>) this.inputMapValueMapOI.getMap(valueMap.get(key));
if ((innerMap == null) || (innerMap.size() == 0)) {
System.out.println("Got null");
continue;
}
for (Object struct : innerMap.keySet()) {
String strValue = (String) this.inputMapValueConverter.convert(struct);
StructField str = (StructField) inputMapValueConverter.convert(innerMap.get(strValue));
/*
Not sure what to do here. Maybe
str.getFieldID();
dataList.add(str.toString());
*/
}
}
catch (ClassCastException c){
System.out.println("Got ClassCastException");
}
}
}
return dataList;
}
}
add jar /path/to/my/jar;
CREATE TEMPORARY FUNCTION modudf AS 'some.package.CustomUDF';
select modudf(val) from dumdum;
Map<?, ?> innerMap = (Map<?, ?>) this.inputMapValueMapOI.getMap(valueMap.get(inner));
if ((innerMap == null) || (innerMap.size() == 0)) {
System.out.println("Got null");
continue;
}
我可以看到…的输出
System.out.println("Got "+innerKey);
在控制台上。
为什么我的转换器不能访问内部映射?
inputMapValueMapOI = (MapObjectInspector) this.inputMapOI.getMapValueObjectInspector();
List<String> structFieldNames = new ArrayList<String>();
structFieldNames.add("student_id");
structFieldNames.add("age");
List<ObjectInspector> structFieldObjectInspectors = new ArrayList<ObjectInspector>();
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector);
structOI = ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames, structFieldObjectInspectors);
String innerkey = (String) inputMapKeyConverter.convert(value);
System.out.println(innerKey);
Map<?, ?> innerMap = (Map<?, ?>) this.inputMapValueMapOI.getMap(valueMap.get(innerkey));
if ((innerMap == null) || (innerMap.size() == 0)) {
System.out.println("null inner map");
continue;
}
for (Object struct : innerMap.keySet()) {
String ikey = (String) inputMapKeyConverter.convert(struct);
Object obj = structOI.getStructFieldData(innerMap.get(ikey), structOI.getStructFieldRef("student_id"));
dataList.add(obj.toString());
}
null inner map
我是否没有正确定义内部地图检查器?
我建议您不要使用转换器,只需为内部映射定义第二个MapObjectInspecter
,获取外部映射值并调用getMap
就像第一个映射一样。要获得结构值,需要在initialize
中定义structobjectInspector
类型的变量,例如。
StructObjectInspector soi = ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames, structFieldObjectInspectors)
然后
Object obj = soi.getStructFieldData(innerMapValue, soi.getStructFieldRef("student_id"))
更新:尝试将映射键转换为标准键,如下所示
private Map stdKeys(Map inspectMap) {
Map objMap = new HashMap();
for (Object inspKey : inspectMap.keySet()) {
Object objKey = ((PrimitiveObjectInspector) mapInspector.getMapKeyObjectInspector()).getPrimitiveJavaObject(inspKey);
objMap.put(objKey, inspKey);
}
return objMap;
}
我在Select query where条件下执行了带有自定义配置单元UDF函数的配置单元SQL脚本,它已经运行了两天多。我想知道这里到底有什么问题?调用java需要很多时间,还是查询执行本身需要很多时间? 我的数据集如下,A表有200万条记录,B表有100万条记录,
添加/home/cloudera/date.jar到类路径添加资源:/home/cloudera/date.jar 请有人帮帮我,因为我是新来蜂巢的。有人能告诉我要遵循的步骤吗
我看到这样一个udf: https://github.com/edwardcapriolo/hive-geoip 如何在Hive中利用udf?我可以自己创建函数名吗?
我在java中开发了一个工作正常的配置单元udf,我的函数返回输入与配置单元表中列之间的最佳匹配,因此它有以下简化的伪代码: 我的问题是,如果这个函数是由Hive调用的,为什么我需要在代码中连接到Hive?我可以使用使用我的功能的用户所连接的当前连接吗?
null 我将把所有三个文件转换成标准格式(文件1格式--一个有4列的输出)。要转换为标准格式,我需要引用文件第一行中的头记录。因此,如果我的输入文件是256MB,并且调用了多个映射器,是否有任何方法使每个映射器可以引用一个全局变量(头信息)。 简而言之,是否有一种方法为所有调用我的配置单元UDF的映射器提供一个公共变量? 注意:UDF将在单个列表上运行,读取完整的行,然后将其写入下一个表HDFS
TL;DR:我如何在Hive中更新自定义UDF的jar? 我写了自己的(通用)udf,工作得很好。我可以定义一个新函数,并将其与命令一起使用: null null