Skip to main content
added 326 characters in body; deleted 5 characters in body
Source Link
samkart
  • 6.7k
  • 3
  • 19
  • 35

There are 819 non-pangrams and 49 perfect pangrams. There are 132 pangrams excluding the perfect pangrams. The number of pangrams is 181 if you include perfect pangrams as well, considering they're "pangrams".

import pandas as pd
import pyspark.sql.functions as func

# Location of the JSON file that holds the input strings.
file_path = './drive/MyDrive/Copy of List of 1000 strings.json'

# pandas parses the JSON; Spark receives it as a single column named 'strings'.
# NOTE(review): `spark` is not created in this snippet -- presumably an existing
# SparkSession provided by the notebook environment; confirm before running.
data_sdf = spark.createDataFrame(pd.read_json(file_path), ['strings'])

# Per string: lower-case it, remove every character outside a-z, split what is
# left into single characters (dropping '' and ' ' elements), then record the
# total number of letters and the number of distinct letters.
pre_process_sdf = data_sdf. \
    withColumn('strings_lower', func.lower('strings')). \
    withColumn('string_letter_split',
               func.filter(func.split(func.regexp_replace('strings_lower', '[^a-z]', ''), ''), lambda x: ~x.isin('', ' '))
               ). \
    withColumn('distinct_letters', func.array_distinct('string_letter_split')). \
    withColumn('actual_size', func.size('string_letter_split')). \
    withColumn('distinct_size', func.size('distinct_letters'))

# Flag every row with 0/1 indicators and sum them in one pass:
#   non_pangram     - fewer than 26 distinct letters
#   pangram         - exactly 26 distinct letters
#   perfect_pangram - a pangram made of exactly 26 letters (each used once)
# 'pangrams_excl_perfects' sums the pangram flag only over rows that are not
# perfect pangrams.
pre_process_sdf. \
    withColumn('non_pangram', (func.col('distinct_size') < 26).cast('int')). \
    withColumn('pangram', (func.col('distinct_size') == 26).cast('int')). \
    withColumn('perfect_pangram', ((func.col('pangram') == 1) & (func.col('actual_size') == 26)).cast('int')). \
    select(func.sum('non_pangram').alias('non_pangrams'),
           func.sum('pangram').alias('pangrams'),
           func.sum('perfect_pangram').alias('perfect_pangrams'),
           func.sum(func.when(func.col('perfect_pangram') == 0, func.col('pangram'))).alias('pangrams_excl_perfects')
           ). \
    show()

# +------------+--------+----------------+----------------------+
# |non_pangrams|pangrams|perfect_pangrams|pangrams_excl_perfects|
# +------------+--------+----------------+----------------------+
# |         819|     181|              49|                   132|
# +------------+--------+----------------+----------------------+

there are 819 non-pangrams and 49 perfect pangrams. there are 132 pangrams excluding the perfect pangrams. the number of pangrams could be 181 if you include perfect pangrams as well considering they're "pangrams".

import pandas as pd
import pyspark.sql.functions as func

# Path to the JSON input holding the strings to classify.
file_path = './drive/MyDrive/Copy of List of 1000 strings.json'

# Parse the JSON with pandas, then expose it to Spark as one column, 'strings'.
# NOTE(review): `spark` is assumed to be an already-available SparkSession.
data_sdf = spark.createDataFrame(pd.read_json(file_path), schema=['strings'])

pre_process_sdf = (
    data_sdf
    .withColumn('strings_lower', func.lower(func.col('strings')))
    # Keep only a-z, split into single characters, and drop '' / ' ' elements.
    .withColumn(
        'string_letter_split',
        func.filter(
            func.split(func.regexp_replace('strings_lower', '[^a-z]', ''), ''),
            lambda ch: ~ch.isin('', ' '),
        ),
    )
    .withColumn('distinct_letters', func.array_distinct('string_letter_split'))
    # Total letter count vs. number of different letters in the string.
    .withColumn('actual_size', func.size('string_letter_split'))
    .withColumn('distinct_size', func.size('distinct_letters'))
)

(
    pre_process_sdf
    # Label each row; branches are evaluated in order, so the second branch is
    # only reached once distinct_size is at least 26.
    .withColumn(
        'result',
        func.when(func.col('distinct_size') < 26, func.lit('non-pangram'))
        .when(func.col('distinct_size') == func.col('actual_size'), func.lit('perfect pangram'))
        .otherwise(func.lit('pangram')),
    )
    # Count how many strings landed under each label and print the table.
    .groupBy('result')
    .agg(func.count(func.col('strings')).alias('string_count'))
    .show()
)

# +---------------+------------+
# |         result|string_count|
# +---------------+------------+
# |        pangram|         132|
# |    non-pangram|         819|
# |perfect pangram|          49|
# +---------------+------------+

There are 819 non-pangrams and 49 perfect pangrams. There are 132 pangrams excluding the perfect pangrams. The number of pangrams is 181 if you include perfect pangrams as well, considering they're "pangrams".

import pandas as pd
import pyspark.sql.functions as func

# Path to the JSON input holding the strings to classify.
file_path = './drive/MyDrive/Copy of List of 1000 strings.json'

# Parse the JSON with pandas, then expose it to Spark as one column, 'strings'.
# NOTE(review): `spark` is assumed to be an already-available SparkSession.
data_sdf = spark.createDataFrame(pd.read_json(file_path), schema=['strings'])

pre_process_sdf = (
    data_sdf
    .withColumn('strings_lower', func.lower(func.col('strings')))
    # Keep only a-z, split into single characters, and drop '' / ' ' elements.
    .withColumn(
        'string_letter_split',
        func.filter(
            func.split(func.regexp_replace('strings_lower', '[^a-z]', ''), ''),
            lambda ch: ~ch.isin('', ' '),
        ),
    )
    .withColumn('distinct_letters', func.array_distinct('string_letter_split'))
    # Total letter count vs. number of different letters in the string.
    .withColumn('actual_size', func.size('string_letter_split'))
    .withColumn('distinct_size', func.size('distinct_letters'))
)

(
    pre_process_sdf
    # 0/1 indicator columns, one per category.
    .withColumn('non_pangram', (func.col('distinct_size') < 26).cast('int'))
    .withColumn('pangram', (func.col('distinct_size') == 26).cast('int'))
    .withColumn(
        'perfect_pangram',
        ((func.col('pangram') == 1) & (func.col('actual_size') == 26)).cast('int'),
    )
    # Sum the indicators into a single summary row; the last column adds the
    # pangram flag only for rows whose perfect_pangram flag is 0.
    .select(
        func.sum('non_pangram').alias('non_pangrams'),
        func.sum('pangram').alias('pangrams'),
        func.sum('perfect_pangram').alias('perfect_pangrams'),
        func.sum(
            func.when(func.col('perfect_pangram') == 0, func.col('pangram'))
        ).alias('pangrams_excl_perfects'),
    )
    .show()
)

# +------------+--------+----------------+----------------------+
# |non_pangrams|pangrams|perfect_pangrams|pangrams_excl_perfects|
# +------------+--------+----------------+----------------------+
# |         819|     181|              49|                   132|
# +------------+--------+----------------+----------------------+
Source Link
samkart
  • 6.7k
  • 3
  • 19
  • 35

I did it in pyspark.

there are 819 non-pangrams and 49 perfect pangrams. there are 132 pangrams excluding the perfect pangrams. the number of pangrams could be 181 if you include perfect pangrams as well considering they're "pangrams".

import pandas as pd
import pyspark.sql.functions as func

# Where the JSON list of input strings lives.
file_path = './drive/MyDrive/Copy of List of 1000 strings.json'

# Read the file via pandas and wrap it in a Spark DataFrame with one column.
# NOTE(review): `spark` is assumed to be an already-available SparkSession.
data_sdf = spark.createDataFrame(pd.read_json(file_path), schema=['strings'])

pre_process_sdf = (
    data_sdf
    .withColumn('strings_lower', func.lower(func.col('strings')))
    # Remove non a-z characters, explode the rest into an array of single
    # characters, and filter out '' / ' ' entries.
    .withColumn(
        'string_letter_split',
        func.filter(
            func.split(func.regexp_replace('strings_lower', '[^a-z]', ''), ''),
            lambda ch: ~ch.isin('', ' '),
        ),
    )
    .withColumn('distinct_letters', func.array_distinct('string_letter_split'))
    # actual_size: all letters; distinct_size: different letters.
    .withColumn('actual_size', func.size('string_letter_split'))
    .withColumn('distinct_size', func.size('distinct_letters'))
)

(
    pre_process_sdf
    # Under 26 distinct letters -> 'non-pangram'; otherwise 'perfect pangram'
    # when no letter repeats (distinct_size equals actual_size), else 'pangram'.
    .withColumn(
        'result',
        func.when(func.col('distinct_size') < 26, func.lit('non-pangram'))
        .when(func.col('distinct_size') == func.col('actual_size'), func.lit('perfect pangram'))
        .otherwise(func.lit('pangram')),
    )
    # One output row per label with the number of strings carrying it.
    .groupBy('result')
    .agg(func.count(func.col('strings')).alias('string_count'))
    .show()
)

# +---------------+------------+
# |         result|string_count|
# +---------------+------------+
# |        pangram|         132|
# |    non-pangram|         819|
# |perfect pangram|          49|
# +---------------+------------+