So I have an Azure Data Factory ingestion that's using a Databricks notebook to parse illegal column name characters before saving it. This eventually put into a separate database.table. The code I decided on works, but is really inefficient....like five hours for an excel sheet that has over 350 columns. I need another approach that is more efficient to cut run times down.
#Replace illegal column names
for column in df.columns:
df = df.withColumnRenamed(column, column.lstrip())
for column in df.columns:
df = df.withColumnRenamed(column, column.rstrip())
for column in df.columns:
df = df.withColumnRenamed(column, column.replace(">", "greaterthan"))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("?", ""))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("!", ""))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("#", "number"))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("&", "and"))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("$", ""))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("/", "_"))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("-", "_"))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace(",", "_"))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("(", ""))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace(")", ""))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("{", ""))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("}", ""))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("=", "equals"))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("\n", ""))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("\t", ""))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("'", ""))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace(".", ""))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("+", "plus"))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace(":", ""))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("...", ""))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace(" ", "_"))
for column in df.columns:
df = df.withColumnRenamed(column, column.replace("__", ""))