In Spark, you can iterate over every column of a DataFrame and find each column's maximum length with the following steps:
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
val data = Seq(
("John", "Doe", 25),
("Jane", "Smith", 30),
("Bob", "Johnson", 35)
)
val schema = StructType(Seq(
StructField("first_name", StringType, nullable = false),
StructField("last_name", StringType, nullable = false),
StructField("age", IntegerType, nullable = false)
))
// createDataFrame(rdd, schema) requires an RDD[Row], so convert the tuples first
val rows = spark.sparkContext.parallelize(data.map(Row.fromTuple))
val df = spark.createDataFrame(rows, schema)
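If the explicit schema is not important, a shorter construction (a sketch assuming import spark.implicits._ is in scope) builds an equivalent DataFrame directly from the tuples; note the inferred schema may differ slightly, e.g. in nullability:

import spark.implicits._
val dfAlt = data.toDF("first_name", "last_name", "age")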
def getMaxColumnLength(df: DataFrame): Map[String, Int] = {
  df.schema.fields.map { field =>
    val columnName = field.name
    // Cast to string first so length() also works for non-string columns,
    // and aggregate on the executors instead of collecting every row to the driver.
    val maxLength = df
      .agg(max(length(col(columnName).cast(StringType))))
      .first()
      .getInt(0)
    (columnName, maxLength)
  }.toMap
}
val maxColumnLengths = getMaxColumnLength(df)
maxColumnLengths.foreach { case (columnName, maxLength) =>
println(s"Max length of column '$columnName': $maxLength")
}
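With the sample data above, the loop prints something like the following (map ordering may vary; the age column is measured on its string form):

Max length of column 'first_name': 4
Max length of column 'last_name': 7
Max length of column 'age': 2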
This iterates over every column and reports its maximum length. Note that length() is only defined for string (and binary) columns; the cast to StringType above makes it work for other column types as well, by measuring the length of each value's string representation.
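Because the function above launches one Spark job per column, on wide DataFrames you may prefer a single-pass variant (a minimal sketch reusing the same df and imports) that computes every maximum in one aggregation:

val aggExprs = df.columns.map { c =>
  // one max(length(...)) expression per column, aliased to the column name
  max(length(col(c).cast(StringType))).alias(c)
}
val resultRow = df.agg(aggExprs.head, aggExprs.tail: _*).first()
val maxLengthsOnePass =
  df.columns.map(c => c -> resultRow.getInt(resultRow.fieldIndex(c))).toMap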