Spark

import pyspark

def sort_and_replace_columns():
    """Worked example: sort a DataFrame with a per-column sort direction
    (ascending on one column, descending on another), then project a
    subset of columns in a new order.
    """
    # Keep the shuffle small for a toy-sized local example.
    session = (
        pyspark.sql.SparkSession.builder
        .config('spark.sql.shuffle.partitions', 4)
        .getOrCreate()
    )

    rows = [
        (1, 1, 'a'),
        (2, 3, 'b'),
        (1, 2, 'd'),
        (1, 7, 's'),
        (2, 3, 'q'),
        (1, 5, 'e'),
        (2, 2, 'a'),
    ]
    columns = ('user', 'score', 'other column')
    df = session.createDataFrame(rows, columns)
    df.show()
    # >>>
    # +----+-----+------------+
    # |user|score|other column|
    # +----+-----+------------+
    # |   1|    1|           a|
    # |   2|    3|           b|
    # |   1|    2|           d|
    # |   1|    7|           s|
    # |   2|    3|           q|
    # |   1|    5|           e|
    # |   2|    2|           a|
    # +----+-----+------------+

    # Column.asc()/.desc() express the mixed sort direction explicitly
    # (equivalent to orderBy([...], ascending=[1, 0])).
    ordered = df.orderBy(df['user'].asc(), df['score'].desc())
    ordered.select('user', 'other column', 'score').show()
    # >>>
    # +----+------------+-----+
    # |user|other column|score|
    # +----+------------+-----+
    # |   1|           s|    7|
    # |   1|           e|    5|
    # |   1|           d|    2|
    # |   1|           a|    1|
    # |   2|           b|    3|
    # |   2|           q|    3|
    # |   2|           a|    2|
    # +----+------------+-----+

Last updated

Was this helpful?