当前位置: 首页 > 知识库问答 >
问题:

Spark dataframe如何使用Seq[String]选择列

谈禄
2023-03-14

Input schema 
root
 |-- class: string (nullable = true)
 |-- createdBy: string (nullable = true)
 |-- createdDate: struct (nullable = true)
 |    |-- $date: long (nullable = true)
 |-- id: struct (nullable = true)
 |    |-- $oid: string (nullable = true)
 |-- lastModifiedBy: string (nullable = true)
 |-- lastModifiedDate: struct (nullable = true)
 |    |-- $date: long (nullable = true)
 |-- planId: string (nullable = true)
 |-- planWeekDataFormatted: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- bbDemoImps: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- bbDemoImpsAttributes: struct (nullable = true)
 |    |    |    |    |    |-- demoId: string (nullable = true)
 |    |    |    |    |    |-- imps: long (nullable = true)
 |    |    |    |    |    |-- ue: long (nullable = true)
 |    |    |    |    |-- uuid: long (nullable = true)
 |    |    |-- demoValues: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- demoAttributes: struct (nullable = true)
 |    |    |    |    |    |-- cpm: long (nullable = true)
 |    |    |    |    |    |-- cpp: long (nullable = true)
 |    |    |    |    |    |-- demoId: string (nullable = true)
 |    |    |    |    |    |-- grps: long (nullable = true)
 |    |    |    |    |    |-- imps: long (nullable = true)
 |    |    |    |    |    |-- rcImps: long (nullable = true)
 |    |    |    |    |    |-- totalCpm: long (nullable = true)
 |    |    |    |    |    |-- totalGrps: long (nullable = true)
 |    |    |    |    |    |-- totalImps: long (nullable = true)
 |    |    |    |    |    |-- ue: long (nullable = true)
 |    |    |    |    |    |-- vpvh: long (nullable = true)
 |    |    |    |    |-- demoId: long (nullable = true)
 |    |    |-- hhDemo: struct (nullable = true)
 |    |    |    |-- demoId: string (nullable = true)
 |    |    |    |-- imps: long (nullable = true)
 |    |    |    |-- ue: long (nullable = true)
 |    |    |-- periodId: string (nullable = true)
 |    |    |-- rcPublishedDate: string (nullable = true)
 |    |    |-- unitRates: struct (nullable = true)
 |    |    |    |-- rate: long (nullable = true)
 |    |    |    |-- rcRate: long (nullable = true)
 |    |    |    |-- totalRate: long (nullable = true)
 |    |    |    |-- units: string (nullable = true)
 |    |    |-- uuid: long (nullable = true)
 |    |    |-- weekStartDate: long (nullable = true)
 |-- planWorkspaceProduct: struct (nullable = true)
 |    |-- channelId: string (nullable = true)
 |    |-- commercialTypeId: string (nullable = true)
 |    |-- lineClassAttributes: struct (nullable = true)
 |    |    |-- canExport: boolean (nullable = true)
 |    |    |-- canInvoice: boolean (nullable = true)
 |    |    |-- canProduce: boolean (nullable = true)
 |    |    |-- guaranteedAudience: long (nullable = true)
 |    |    |-- guaranteedRate: long (nullable = true)
 |    |    |-- hasPerformance: boolean (nullable = true)
 |    |    |-- planAudience: long (nullable = true)
 |    |    |-- planRate: long (nullable = true)
 |    |-- lineClassId: string (nullable = true)
 |    |-- lineId: string (nullable = true)
 |    |-- lineNo: struct (nullable = true)
 |    |    |-- $numberLong: string (nullable = true)
 |    |-- planProductId: string (nullable = true)
 |    |-- productId: string (nullable = true)
 |    |-- spotLengthId: string (nullable = true)
 |-- rates: struct (nullable = true)
 |    |-- period: struct (nullable = true)
 |    |    |-- endDate: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- startDate: long (nullable = true)
 |-- version: struct (nullable = true)
 |    |-- $numberLong: string (nullable = true)
 |-- offsets: integer (nullable = true)
 |-- modifiedTime: long (nullable = true)
 |-- opCode: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- tenant: string (nullable = true)
 |-- etl_timestamp: long (nullable = false)
 |-- topic: string (nullable = true)

预期输出架构

root
 |-- class: string (nullable = true)
 |-- createdBy: string (nullable = true)
 |-- lastModifiedBy: string (nullable = true)
 |-- planId: string (nullable = true)
 |-- offsets: integer (nullable = true)
 |-- modifiedTime: long (nullable = true)
 |-- opCode: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- tenant: string (nullable = true)
 |-- etl_timestamp: long (nullable = false)
 |-- topic: string (nullable = true)
 |-- createdDate_$date: long (nullable = true)
 |-- id_$oid: string (nullable = true)
 |-- lastModifiedDate_$date: long (nullable = true)
 |-- planWorkspaceProduct_channelId: string (nullable = true)
 |-- planWorkspaceProduct_commercialTypeId: string (nullable = true)
 |-- planWorkspaceProduct_lineClassId: string (nullable = true)
 |-- planWorkspaceProduct_lineId: string (nullable = true)
 |-- planWorkspaceProduct_planProductId: string (nullable = true)
 |-- planWorkspaceProduct_productId: string (nullable = true)
 |-- planWorkspaceProduct_spotLengthId: string (nullable = true)
 |-- version_$numberLong: string (nullable = true)
 |-- planWeekDataFormatted_periodId: string (nullable = true)
 |-- planWeekDataFormatted_rcPublishedDate: string (nullable = true)
 |-- planWeekDataFormatted_weekStartDate: long (nullable = true)
 |-- planWorkspaceProduct_lineClassAttributes_canExport: boolean (nullable = true)
 |-- planWorkspaceProduct_lineClassAttributes_canInvoice: boolean (nullable = true)
 |-- planWorkspaceProduct_lineClassAttributes_canProduce: boolean (nullable = true)
 |-- planWorkspaceProduct_lineClassAttributes_guaranteedAudience: long (nullable = true)
 |-- planWorkspaceProduct_lineClassAttributes_guaranteedRate: long (nullable = true)
 |-- planWorkspaceProduct_lineClassAttributes_hasPerformance: boolean (nullable = true)
 |-- planWorkspaceProduct_lineClassAttributes_planAudience: long (nullable = true)
 |-- planWorkspaceProduct_lineClassAttributes_planRate: long (nullable = true)
 |-- planWorkspaceProduct_lineNo_$numberLong: string (nullable = true)
 |-- rates_period_endDate: long (nullable = true)
 |-- rates_period_name: string (nullable = true)
 |-- rates_period_startDate: long (nullable = true)
 **|-- planWeekDataFormatted_hhDemo_demoId: string (nullable = true)**
 |-- planWeekDataFormatted_unitRates_rate: long (nullable = true)
 |-- planWeekDataFormatted_unitRates_rcRate: long (nullable = true)
 |-- planWeekDataFormatted_unitRates_totalRate: long (nullable = true)
 |-- planWeekDataFormatted_unitRates_units: string (nullable = true)
 **|-- planWeekDataFormatted_bbDemoImps_bbDemoImpsAttributes_demoId: string (nullable = true)**
 **|-- planWeekDataFormatted_demoValues_demoAttributes_demoId: string (nullable = true)**

尝试使用下面的代码先展开(explode)ArrayType 列“planWeekDataFormatted”,再展开其中嵌套的 ArrayType 列“bbDemoImps”和“demoValues”,并且只从数组的每个元素中提取 demoId。

// Collect the names of every column of resultDF except "planWeekDataFormatted";
// these are the columns that should be carried through to the result unchanged.
    val dfwithoutPlanWeekData = resultDF.drop("planWeekDataFormatted")
    val colsWithoutPlanWeekData = dfwithoutPlanWeekData.columns.toSeq

// Explode the outer array column, then the two arrays nested inside each element,
// and finally project the passthrough columns plus the three demoId fields.
val planweek_exploded = resultDF.withColumn("planWeekItem", explode($"planWeekDataFormatted"))
      // Explode the nested bbDemoImps array of each planWeekItem.
      .withColumn("bbDemoImpsAttribute", explode($"planWeekItem.bbDemoImps"))
      // Explode the nested demoValues array of each planWeekItem.
      .withColumn("demoValuesAttribute", explode($"planWeekItem.demoValues"))
      // hhDemo is a struct, not an array, so it is referenced directly without explode.
      .withColumn("hhDemoAttribute", $"planWeekItem.hhDemo")
      // NOTE: this select call is the one reported in the compiler error quoted below.
      // The `: _*` expansion of the mapped columns is followed by further Column
      // arguments, and no overload of select accepts that argument list.
      .select(
        colsWithoutPlanWeekData.map(c => col(c)): _*,
        col("bbDemoImpsAttribute.bbDemoImpsAttributes.demoId").as("bbDemoId"),
        col("demoValuesAttribute.demoAttributes.demoId").as("demoId"),
        col("hhDemoAttribute.demoId").as("hhDemoId")
      ).drop("planWeekItem", "bbDemoImpsAttribute", "demoValuesAttribute", "hhDemoAttribute")

但是 Spark DataFrame 的 select 不允许把由 Seq[String] 映射得到的列(通过 `: _*` 展开)与其他 Column 参数混在一起传入

获取以下错误

> overloaded method value select with alternatives:   [U1, U2, U3,
> U4](c1: org.apache.spark.sql.TypedColumn[org.apache.spark.sql.Row,U1],
> c2: org.apache.spark.sql.TypedColumn[org.apache.spark.sql.Row,U2], c3:
> org.apache.spark.sql.TypedColumn[org.apache.spark.sql.Row,U3], c4:
> org.apache.spark.sql.TypedColumn[org.apache.spark.sql.Row,U4])org.apache.spark.sql.Dataset[(U1,
> U2, U3, U4)] <and>   (col: String,cols:
> String*)org.apache.spark.sql.DataFrame <and>   (cols:
> org.apache.spark.sql.Column*)org.apache.spark.sql.DataFrame  cannot be
> applied to (String, org.apache.spark.sql.Column,
> org.apache.spark.sql.Column, org.apache.spark.sql.Column)
>       .select(

共有1个答案

公良俊楚
2023-03-14

使用:

.select(
        // Build a single Seq[Column] first: the passthrough columns mapped from
        // colsWithoutPlanWeekData, concatenated (++) with the three aliased demoId columns.
        (colsWithoutPlanWeekData.map(c => col(c)) ++ Seq(
        col("bbDemoImpsAttribute.bbDemoImpsAttributes.demoId").as("bbDemoId"),
        col("demoValuesAttribute.demoAttributes.demoId").as("demoId"),
        // `: _*` is applied once, to the whole concatenated sequence, so it is the
        // only (and last) argument passed to the select(cols: Column*) overload.
        col("hhDemoAttribute.demoId").as("hhDemoId"))): _*
)

也就是说:先用 `++` 把两个 Seq[Column] 连接成一个序列,再对整个序列使用 `: _*` 语法糖展开。

 类似资料:
  • 我试图使用以下公式将Future[Seq[(String,String)]转换为Future[Seq[(String)]: 所以 sortedSeq 是 Future[Seq[(String, String)]] 但我一直得到错误: 我做错了什么?

  • 我知道scala.collection包中有两个极其有用的对象,可以帮助我们实现这个目标: JavaConverters(如果我想明确地说出我想要转换的内容) JavaConversions(如果我不想共同控制转换,让编译器为我做隐式工作) 但是在我的案例中应用它们有一些困难,因为我的数据结构比我在许多示例中看到的其他数据结构要复杂一点。 我在scala代码中,我希望我的scala函数返回一个Ja

  • 问题内容: 我想使用Hibernate选择单列而不是整个对象。到目前为止,我有这个: 我的问题是上述代码将整个People表返回为一个对象,而不仅仅是“ firstname”。我不确定如何指定仅返回“名字”而不是整个对象。 问题答案: 您可以像这样设置投影: 有了这个,您只能得到名字的回报。 我在同一情况下在堆栈上找到了另一个链接。希望这也将对您有所帮助。

  • 问题内容: 我只想通过“ where子句”从数据库中选择(并返回)一个字段。代码是: 这将失败,并且回溯是: 如何选择并仅返回“ id”列?我也尝试了其他几种方法,但也失败了。是“ load_only”正确的选项吗? 问题答案: 一个对象接受实体查询作为位置参数,所以只是通过它: 返回第一个结果的第一个元素;如果未找到任何行,则返回None。它为多行引发异常。 表示仅应加载实体的给定基于列的属性,

  • 我有一些连接的选择工作良好。顺便说一句,我想把这些选择转换成很小的选择,但我发现这样做有些困难。例如,我有一个ID为的select。当我选择的一个大于零的选项时,应该会出现其他选择。之后,当我更改级联选择的选项(例如在中)时,该选项会正确更改。但是,当我选择另一次时,中的选项zero和我选择另一次时,中大于zero的选项会出现,而另一次选择中已经选择了选项。 这是我的javascript代码: 这

  • 我正在尝试使用selenium和Python单击这个单选按钮。 我有 但它不允许我点击它。我如何使用名称、值或类、值等的组合来点击? 关于如何使用硒,是否有一个很好的信息来源?因为我所发现的大部分是在java上使用的,而我使用的是Python。 编辑:使用XPATH