# Spark — PySpark example: multi-column ordering and column selection
import pyspark
def sort_and_replace_columns():
    """Example: order a DataFrame by several columns (mixed directions) and
    project only the needed columns, in a new column order.

    Sorts by ``user`` ascending and ``score`` descending, then selects the
    columns as (user, other column, score). Returns ``None``; output is
    printed via ``DataFrame.show()``.
    """
    # Keep shuffles tiny for this toy dataset (default is 200 partitions).
    session = (pyspark.sql.SparkSession.builder
               .config('spark.sql.shuffle.partitions', 4)
               .getOrCreate())
    try:
        df = session.createDataFrame(
            [
                (1, 1, 'a'),
                (2, 3, 'b'),
                (1, 2, 'd'),
                (1, 7, 's'),
                (2, 3, 'q'),
                (1, 5, 'e'),
                (2, 2, 'a'),
            ],
            ('user', 'score', 'other column'))
        df.show()
        # >>>
        # +----+-----+------------+
        # |user|score|other column|
        # +----+-----+------------+
        # |   1|    1|           a|
        # |   2|    3|           b|
        # |   1|    2|           d|
        # |   1|    7|           s|
        # |   2|    3|           q|
        # |   1|    5|           e|
        # |   2|    2|           a|
        # +----+-----+------------+

        # user ascending, score descending — booleans, not 1/0 int flags.
        ndf = df.orderBy(['user', 'score'], ascending=[True, False])
        ndf.select('user', 'other column', 'score').show()
        # >>>
        # +----+------------+-----+
        # |user|other column|score|
        # +----+------------+-----+
        # |   1|           s|    7|
        # |   1|           e|    5|
        # |   1|           d|    2|
        # |   1|           a|    1|
        # |   2|           b|    3|
        # |   2|           q|    3|
        # |   2|           a|    2|
        # +----+------------+-----+
    finally:
        # Release the session so the example doesn't leak a running SparkContext.
        session.stop()