Return to discussion

added 326 characters in body; deleted 5 characters in body

Source Link

edited Sep 19 at 8:43

6.7k
3
19
35

There are 819 non-pangrams and 49 perfect pangrams. There are 132 pangrams excluding the perfect pangrams. The number of pangrams is 181 if you include perfect pangrams as well, considering they're "pangrams".

import pandas as pd
import pyspark.sql.functions as func

# Path of the JSON file that holds the input strings.
file_path = './drive/MyDrive/Copy of List of 1000 strings.json'

# Read the JSON with pandas, then hand it to Spark with its column named 'strings'.
data_sdf = spark.createDataFrame(pd.read_json(file_path), ['strings'])

# Per string: lowercase it, break it into an array of its a-z characters,
# de-duplicate that array, and record the size of both arrays.
pre_process_sdf = (
    data_sdf
    .withColumn('strings_lower', func.lower('strings'))
    .withColumn(
        'string_letter_split',
        func.filter(
            # Strip every character outside a-z, then split on the empty pattern.
            func.split(func.regexp_replace('strings_lower', '[^a-z]', ''), ''),
            # Keep only elements that are neither '' nor ' '.
            lambda letter: letter.isin('', ' ') == False,
        ),
    )
    .withColumn('distinct_letters', func.array_distinct('string_letter_split'))
    .withColumn('actual_size', func.size('string_letter_split'))
    .withColumn('distinct_size', func.size('distinct_letters'))
)

# Build 0/1 indicator columns per string and sum them, so that all four counts
# come back in a single row.
(
    pre_process_sdf
    # 1 when the string has fewer than 26 distinct a-z letters.
    .withColumn('non_pangram', (func.col('distinct_size') < 26).cast('int'))
    # 1 when all 26 letters are present.
    .withColumn('pangram', (func.col('distinct_size') == 26).cast('int'))
    # 1 when it is a pangram made of exactly 26 letters (each letter once).
    .withColumn(
        'perfect_pangram',
        ((func.col('pangram') == 1) & (func.col('actual_size') == 26)).cast('int'),
    )
    .select(
        func.sum('non_pangram').alias('non_pangrams'),
        func.sum('pangram').alias('pangrams'),
        func.sum('perfect_pangram').alias('perfect_pangrams'),
        # `when` without `otherwise` yields null for perfect pangrams, so they
        # do not contribute to this sum.
        func.sum(
            func.when(func.col('perfect_pangram') == 0, func.col('pangram'))
        ).alias('pangrams_excl_perfects'),
    )
    .show()
)

# +------------+--------+----------------+----------------------+
# |non_pangrams|pangrams|perfect_pangrams|pangrams_excl_perfects|
# +------------+--------+----------------+----------------------+
# |         819|     181|              49|                   132|
# +------------+--------+----------------+----------------------+

There are 819 non-pangrams and 49 perfect pangrams. There are 132 pangrams excluding the perfect pangrams. The number of pangrams could be 181 if you include perfect pangrams as well, considering they're "pangrams".

import pandas as pd
import pyspark.sql.functions as func

# Path of the JSON file that holds the input strings.
file_path = './drive/MyDrive/Copy of List of 1000 strings.json'

# Read the JSON with pandas, then hand it to Spark with its column named 'strings'.
data_sdf = spark.createDataFrame(pd.read_json(file_path), ['strings'])

# Per string: lowercase it, break it into an array of its a-z characters,
# de-duplicate that array, and record the size of both arrays.
pre_process_sdf = (
    data_sdf
    .withColumn('strings_lower', func.lower('strings'))
    .withColumn(
        'string_letter_split',
        func.filter(
            # Strip every character outside a-z, then split on the empty pattern.
            func.split(func.regexp_replace('strings_lower', '[^a-z]', ''), ''),
            # Keep only elements that are neither '' nor ' '.
            lambda letter: letter.isin('', ' ') == False,
        ),
    )
    .withColumn('distinct_letters', func.array_distinct('string_letter_split'))
    .withColumn('actual_size', func.size('string_letter_split'))
    .withColumn('distinct_size', func.size('distinct_letters'))
)

# Label every string as 'non-pangram', 'perfect pangram' or 'pangram', then
# count how many strings received each label.
(
    pre_process_sdf
    .withColumn(
        'result',
        # Branches are tried in order: the second one is only reached when
        # distinct_size is not below 26.
        func.when(func.col('distinct_size') < 26, func.lit('non-pangram'))
        .when(
            func.col('distinct_size') == func.col('actual_size'),
            func.lit('perfect pangram'),
        )
        .otherwise('pangram'),
    )
    .groupBy('result')
    .agg(func.count('strings').alias('string_count'))
    .show()
)

# +---------------+------------+
# |         result|string_count|
# +---------------+------------+
# |        pangram|         132|
# |    non-pangram|         819|
# |perfect pangram|          49|
# +---------------+------------+

There are 819 non-pangrams and 49 perfect pangrams. There are 132 pangrams excluding the perfect pangrams. The number of pangrams is 181 if you include perfect pangrams as well, considering they're "pangrams".

import pandas as pd
import pyspark.sql.functions as func

# Path of the JSON file that holds the input strings.
file_path = './drive/MyDrive/Copy of List of 1000 strings.json'

# Read the JSON with pandas, then hand it to Spark with its column named 'strings'.
data_sdf = spark.createDataFrame(pd.read_json(file_path), ['strings'])

# Per string: lowercase it, break it into an array of its a-z characters,
# de-duplicate that array, and record the size of both arrays.
pre_process_sdf = (
    data_sdf
    .withColumn('strings_lower', func.lower('strings'))
    .withColumn(
        'string_letter_split',
        func.filter(
            # Strip every character outside a-z, then split on the empty pattern.
            func.split(func.regexp_replace('strings_lower', '[^a-z]', ''), ''),
            # Keep only elements that are neither '' nor ' '.
            lambda letter: letter.isin('', ' ') == False,
        ),
    )
    .withColumn('distinct_letters', func.array_distinct('string_letter_split'))
    .withColumn('actual_size', func.size('string_letter_split'))
    .withColumn('distinct_size', func.size('distinct_letters'))
)

# Build 0/1 indicator columns per string and sum them, so that all four counts
# come back in a single row.
(
    pre_process_sdf
    # 1 when the string has fewer than 26 distinct a-z letters.
    .withColumn('non_pangram', (func.col('distinct_size') < 26).cast('int'))
    # 1 when all 26 letters are present.
    .withColumn('pangram', (func.col('distinct_size') == 26).cast('int'))
    # 1 when it is a pangram made of exactly 26 letters.
    .withColumn(
        'perfect_pangram',
        ((func.col('pangram') == 1) & (func.col('actual_size') == 26)).cast('int'),
    )
    .select(
        func.sum('non_pangram').alias('non_pangrams'),
        func.sum('pangram').alias('pangrams'),
        func.sum('perfect_pangram').alias('perfect_pangrams'),
        # Only rows whose perfect_pangram flag is 0 feed their pangram flag
        # into this sum.
        func.sum(
            func.when(func.col('perfect_pangram') == 0, func.col('pangram'))
        ).alias('pangrams_excl_perfects'),
    )
    .show()
)

# +------------+--------+----------------+----------------------+
# |non_pangrams|pangrams|perfect_pangrams|pangrams_excl_perfects|
# +------------+--------+----------------+----------------------+
# |         819|     181|              49|                   132|
# +------------+--------+----------------+----------------------+

Source Link

created Sep 19 at 8:17

samkart

6.7k
3
19
35

I did it in pyspark.

import pandas as pd
import pyspark.sql.functions as func

# Path of the JSON file that holds the input strings.
file_path = './drive/MyDrive/Copy of List of 1000 strings.json'

# Read the JSON with pandas, then hand it to Spark with its column named 'strings'.
data_sdf = spark.createDataFrame(pd.read_json(file_path), ['strings'])

# Per string: lowercase it, break it into an array of its a-z characters,
# de-duplicate that array, and record the size of both arrays.
pre_process_sdf = (
    data_sdf
    .withColumn('strings_lower', func.lower('strings'))
    .withColumn(
        'string_letter_split',
        func.filter(
            # Strip every character outside a-z, then split on the empty pattern.
            func.split(func.regexp_replace('strings_lower', '[^a-z]', ''), ''),
            # Keep only elements that are neither '' nor ' '.
            lambda letter: letter.isin('', ' ') == False,
        ),
    )
    .withColumn('distinct_letters', func.array_distinct('string_letter_split'))
    .withColumn('actual_size', func.size('string_letter_split'))
    .withColumn('distinct_size', func.size('distinct_letters'))
)

# Label every string as 'non-pangram', 'perfect pangram' or 'pangram', then
# count how many strings received each label.
(
    pre_process_sdf
    .withColumn(
        'result',
        # Branches are tried in order: the second one is only reached when
        # distinct_size is not below 26.
        func.when(func.col('distinct_size') < 26, func.lit('non-pangram'))
        .when(
            func.col('distinct_size') == func.col('actual_size'),
            func.lit('perfect pangram'),
        )
        .otherwise('pangram'),
    )
    .groupBy('result')
    .agg(func.count('strings').alias('string_count'))
    .show()
)

# +---------------+------------+
# |         result|string_count|
# +---------------+------------+
# |        pangram|         132|
# |    non-pangram|         819|
# |perfect pangram|          49|
# +---------------+------------+

Collectives™ on Stack Overflow

Return to discussion