I am new to data science and I need help with the following:
(I) Splitting a dataset by the unique combinations of two columns, in my case region and country.
(II) I would like to save each resulting dataframe as a .csv file named regionname_country.csv, for example west_GER.csv or east_POL.csv.
(III) If possible, I would like to loop over each .csv file and plot a scatterplot of education vs. age for each dataframe.
(IV) Lastly, I would like to save my plots/figures in a PDF file (4 figures per page).
'df'
Region, country, Age, Education, Income, FICO, Target
1 west, GER, 43, 1, 47510, 710, 1
2 east, POL, 32, 2, 73640, 723, 1
3 east, POL, 22, 2, 88525, 610, 0
4 west, GER, 55, 0, 31008, 592, 0
5 north, USA, 19, 0, 18007, 599, 1
6 south, PER, 27, 2, 68850, 690, 0
7 south, BRZ, 56, 3, 71065, 592, 0
8 north, USA, 39, 1, 98004, 729, 1
9 east, JPN, 36, 2, 51361, 692, 0
10 west, ESP, 59, 1, 98643, 729, 1
Desired outcome:
# df_to_csv : 'west_GER.csv'
west, GER, 43, 1, 47510, 710, 1
west, GER, 55, 0, 31008, 592, 0
# west_ESP.csv
west, ESP, 59, 1, 98643, 729, 1
# east_POL.csv
east, POL, 32, 2, 73640, 723, 1
.
.
.
# north_USA.csv
north, USA, 39, 1, 98004, 729, 1
north, USA, 19, 0, 18007, 599, 1
See below for my code
# using pandas
# (I) and (II): group on both columns at once. Each group key is then a
# (region, country) tuple, so no nested loop is needed and every file
# contains only the rows of one region/country pair.
for (region, country), split_df in df.groupby(['Region', 'country']):
    # One file per pair, e.g. west_GER.csv, east_POL.csv
    split_df.to_csv(f'{region}_{country}.csv', index=False)
# code for (III) and (IV)
import glob
import numpy
import matplotlib.pyplot
from matplotlib import pyplot as plot
from matplotlib.backends.backend_pdf import PdfPages
# The split files are named <region>_<country>.csv, so match an underscore
# anywhere in the name ('_*.csv' would only match names starting with '_').
filenames = sorted(glob.glob('*_*.csv'))

# Page layout: a 2 x 2 grid gives 4 plots per PDF page.
grid_rows, grid_cols = 2, 2
nb_plots_per_page = grid_rows * grid_cols
nb_plots = len(filenames)

# The context manager writes and closes plots.pdf even if an error occurs.
with PdfPages('plots.pdf') as pdf_pages:
    for i, filename in enumerate(filenames):
        print(filename)

        # Create a figure instance (i.e. a new page) if needed
        if i % nb_plots_per_page == 0:
            fig, axes = plot.subplots(grid_rows, grid_cols,
                                      figsize=(8.5, 12), dpi=125)
            axes = axes.flatten()

        # Skip the header row and read only the age (column 2) and education
        # (column 3) columns; the other columns hold text that loadtxt cannot
        # parse. ndmin=2 keeps a 2-D array even for single-row files.
        data = numpy.loadtxt(fname=filename, delimiter=',', skiprows=1,
                             usecols=(2, 3), ndmin=2)
        x = data[:, 0]  # age column
        y = data[:, 1]  # education column

        # One scatterplot per file, drawn on the next free slot of the page.
        slot = i % nb_plots_per_page
        ax = axes[slot]
        ax.scatter(x, y)
        ax.set_title(filename)
        ax.set_xscale("log")
        ax.set_xlabel("x")
        ax.set_ylabel("y")

        # Close the page if needed
        if (i + 1) % nb_plots_per_page == 0 or (i + 1) == nb_plots:
            # Hide the empty slots of a partially filled last page.
            for unused_ax in axes[slot + 1:]:
                unused_ax.set_visible(False)
            fig.tight_layout()
            pdf_pages.savefig(fig)
            # Release the figure so memory does not grow with the page count.
            plot.close(fig)
Any assistance will be much appreciated, I am open to both python and R. Thank you in advance.
#Attempt for PCA
import glob
import matplotlib.pyplot as plt
# NOTE(review): pd, np, StandardScaler and PCA are not imported in the visible
# part of this file; they are assumed to be in scope from elsewhere.
fig, axs = plt.subplots(nrows=2, ncols=2)
# zip() stops at the shorter input, so at most four CSV files are plotted.
for ax, file in zip(axs.flatten(), glob.glob("./*csv")):
    df_temp = pd.read_csv(file)  # read each csv file
    df_temp.drop('Unnamed: 0', axis=1, inplace=True)  # drop the index number column
    df_temp = df_temp.dropna()  # drop NaNs
    X = df_temp.iloc[:, 4:]  # select the 5th column to the end
    y = df_temp.iloc[:, 0]  # the first column is the label column

    # PCA starts from here
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    pca = PCA(n_components=2)
    pca.fit(X)
    x_pca = pca.transform(X)

    # Convert the x_pca array to a dataframe for easier plotting
    data = pd.DataFrame({'PC1': x_pca[:, 0], 'PC2': x_pca[:, 1]})
    PC1_temp = data['PC1'][0]
    PC2_temp = data['PC2'][0]

    # Label column used to distinguish the two classes.
    # Assumes the labels are the integers 0 and 1 -- TODO confirm.
    categories = y.to_numpy()
    # Desired colors: red for class 0 and green for class 1.
    colormap = np.array(['r', 'g'])

    # Draw each class with its own scatter call so that every class gets a
    # labelled legend entry in its color.
    for class_index, class_color in enumerate(colormap):
        in_class = categories == class_index
        if in_class.any():
            ax.scatter(x_pca[in_class, 0], x_pca[in_class, 1],
                       c=class_color, label=str(class_index))

    ax.set_title(f"PC1:{PC1_temp}, P2:{PC2_temp}")
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    # Per-subplot legend showing which color belongs to which class.
    ax.legend()

plt.tight_layout()
fig.savefig("scatter.pdf")
```