我是数据科学的新手,我需要帮助做以下事情:
(一) 根据一列中的唯一分组并结合另一列的分组来拆分数据集,在我的例子中这两列是地区(Region)和国家(country)
(二) 我想将每个数据帧保存为一个 .csv 文件,文件名类似 regionname_country.csv,例如 west_GER.csv、east_POL.csv
(三) 如果可能的话,我想循环遍历这些 .csv 文件,为每个 df 绘制教育(Education)与年龄(Age)的散点图。
(四) 最后将我的绘图/图形保存在pdf文件中(每页4个图形)
'df'
Region, country, Age, Education, Income, FICO, Target
1 west, GER, 43, 1, 47510, 710, 1
2 east, POL, 32, 2, 73640, 723, 1
3 east, POL, 22, 2, 88525, 610, 0
4 west, GER, 55, 0, 31008, 592, 0
5 north, USA, 19, 0, 18007, 599, 1
6 south, PER, 27, 2, 68850, 690, 0
7 south, BRZ, 56, 3, 71065, 592, 0
8 north, USA, 39, 1, 98004, 729, 1
9 east, JPN, 36, 2, 51361, 692, 0
10 west, ESP, 59, 1, 98643, 729, 1
预期结果:
# df_to_csv : 'west_GER.csv'
west, GER, 43, 1, 47510, 710, 1
west, GER, 55, 0, 31008, 592, 0
# west_ESP.csv
west, ESP, 59, 1, 98643, 729, 1
# east_POL.csv
east, POL, 32, 2, 73640, 723, 1
.
.
.
# north_USA.csv
north, USA, 39, 1, 98004, 729, 1
north, USA, 19, 0, 18007, 599, 1
请参阅下面的代码
# using pandas
# code for (I) and (II) not sure of my code but I think I need to nest through the for loop
# Split `df` into one frame per distinct (Region, country) pair and write each
# to "<region>_<country>.csv" (e.g. "west_GER.csv").
#
# Grouping on both columns at once replaces the nested loops: iterating a
# groupby over a list of keys yields ((region, country), sub_frame), so every
# file receives only the rows belonging to that exact pair.
for (region, country), split_df in df.groupby(['Region', 'country']):
    # Format the two key parts separately; f'{i,j}' would embed a tuple repr
    # such as "('west', 'GER')" in the file name.
    split_df.to_csv(f'{region}_{country}.csv', index=False)
# code for (III) and (IV)
import glob
import numpy
import matplotlib.pyplot
from matplotlib import pyplot as plot
from matplotlib.backends.backend_pdf import PdfPages
# Collect the per-group files; their names look like "<region>_<country>.csv",
# so match an underscore anywhere in the name (the pattern '_*.csv' only
# matches names that *start* with an underscore).
filenames = sorted(glob.glob('*_*.csv'))

nb_plots_per_page = 4

# Open the PDF once for all files; the context manager closes it (and writes
# it to disk) even if a file fails to load.
with PdfPages('plots.pdf') as pdf_pages:
    for first in range(0, len(filenames), nb_plots_per_page):
        page_files = filenames[first:first + nb_plots_per_page]

        # One figure per page, holding `nb_plots_per_page` stacked axes.
        # squeeze=False keeps the result 2-D so flatten() always works.
        fig, axes = plot.subplots(nrows=nb_plots_per_page, ncols=1,
                                  figsize=(8.5, 12), dpi=125, squeeze=False)
        axes = axes.flatten()

        for ax, filename in zip(axes, page_files):
            print(filename)
            # Skip the header row and read only columns 2 and 3 (age and
            # education, per the original column comments); the text columns
            # cannot be parsed as floats. atleast_2d keeps a one-row file
            # two-dimensional.
            data = numpy.atleast_2d(
                numpy.genfromtxt(filename, delimiter=',', skip_header=1,
                                 usecols=(2, 3))
            )
            x = data[:, 0]  # age column
            y = data[:, 1]  # education column
            ax.scatter(x, y)
            ax.set_xscale("log")
            ax.set_xlabel("x")
            ax.set_ylabel("y")
            ax.set_title(filename)

        # Hide the unused axes on a partially filled last page.
        for ax in axes[len(page_files):]:
            ax.set_visible(False)

        fig.tight_layout()
        pdf_pages.savefig(fig)
        # Release the figure; no interactive show() so the loop never blocks.
        plot.close(fig)
任何帮助都将不胜感激,我对python和R都持开放态度。提前谢谢。
#Attempt for PCA
import glob
import matplotlib.pyplot as plt
# Draw one PCA scatter plot per CSV file on a single 2 x 2 page and save it
# to "scatter.pdf". zip() stops at the shorter input, so at most four files
# are plotted.
fig, axs = plt.subplots(nrows=2, ncols=2)

# Colour per class. Assumes the label column holds the integers 0 and 1 (the
# original indexed a two-colour array with it) -- TODO confirm. Rows with any
# other label are not drawn.
class_colors = {0: 'r', 1: 'g'}

for ax, file in zip(axs.flatten(), glob.glob("./*csv")):
    df_temp = pd.read_csv(file)  # read each csv file
    df_temp.drop('Unnamed: 0', axis=1, inplace=True)  # drop the index number column
    df_temp = df_temp.dropna()  # drop NaNs

    X = df_temp.iloc[:, 4:]  # features: the 5th column to the end
    y = df_temp.iloc[:, 0].to_numpy()  # the first column is the label column

    # Standardise the features, then project onto the first two components.
    X = StandardScaler().fit_transform(X)
    x_pca = PCA(n_components=2).fit_transform(X)

    # Component scores of the first sample, shown in the title.
    PC1_temp = x_pca[0, 0]
    PC2_temp = x_pca[0, 1]

    # One scatter call per class so each class gets its own legend entry.
    # (The original `x_pca[,:1]` was a syntax error; the second component is
    # `x_pca[:, 1]`.)
    for label, color in class_colors.items():
        mask = y == label
        ax.scatter(x_pca[mask, 0], x_pca[mask, 1], c=color, label=str(label))

    ax.set_title(f"PC1:{PC1_temp}, P2:{PC2_temp}")
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    # Per-axes legend; a single plt.legend() after the loop would only
    # target the last axes.
    ax.legend()

plt.tight_layout()
fig.savefig("scatter.pdf")
```
对于Python:
(I)
# Write one CSV per distinct (Region, country) pair, named
# "<Region>_<country>.csv".
#
# Iterating the groupby yields ((region, country), sub_frame) directly, so
# the key order is fixed by the column list. The previous np.unique() call
# sorted the two values, so the pair was only reassembled correctly when the
# country happened to sort before the region. This also avoids rebuilding
# the groupby on every iteration.
for (region, country), group in df.groupby(["Region", "country"]):
    # The row index is still written as the first column, as before.
    group.to_csv(f"{region}_{country}.csv")
(III)
import glob
import matplotlib.pyplot as plt
# Scatter Education against Age for every CSV file and save the plots to
# "scatter.pdf", four plots (a 2 x 2 grid) per page.
#
# A single 2 x 2 figure zipped against the file list silently drops every
# file after the fourth, so the files are processed in pages instead.
from matplotlib.backends.backend_pdf import PdfPages

PLOTS_PER_PAGE = 4
csv_files = sorted(glob.glob("./*csv"))  # sorted for a stable page order

with PdfPages("scatter.pdf") as pdf:
    for start in range(0, len(csv_files), PLOTS_PER_PAGE):
        page_files = csv_files[start:start + PLOTS_PER_PAGE]
        fig, axs = plt.subplots(nrows=2, ncols=2)
        page_axes = axs.flatten()

        for ax, file in zip(page_axes, page_files):
            df_temp = pd.read_csv(file)
            # Every row of a file shares one region/country; read the first.
            region_temp = df_temp['Region'].iloc[0]
            country_temp = df_temp['country'].iloc[0]
            ax.scatter(df_temp["Age"], df_temp["Education"])
            ax.set_title(f"Region:{region_temp}, Country:{country_temp}")
            ax.set_xlabel("Age")
            ax.set_ylabel("Education")

        # Hide the unused axes on a partially filled last page.
        for ax in page_axes[len(page_files):]:
            ax.set_visible(False)

        fig.tight_layout()
        pdf.savefig(fig)
        plt.close(fig)
在R中,您可以这样做:
library(tidyverse)
# Split df into a list of data frames, one per (Region, country) pair.
# Only the four selected columns are kept, so the CSVs written below
# contain just these columns.
df %>%
  select(Region, country, Education, Age) %>%
  group_split(Region, country) -> split_data

# Build one scatter plot per group, titled with that group's region/country.
list_plots <- map(split_data, ~ggplot(.) + aes(Education, Age) +
                    geom_point() +
                    ggtitle(sprintf('Plot for region %s and country %s',
                                    first(.$Region), first(.$country))))

# Write every plot into a single multi-page PDF (one plot per page) and
# write each group to "<Region>_<country>.csv".
pdf("plots.pdf", onefile = TRUE)
for (i in seq_along(list_plots)) {
  # Write only the i-th group; passing the whole `split_data` list here
  # would not produce the per-group file.
  write.csv(split_data[[i]], sprintf('%s_%s.csv',
            split_data[[i]]$Region[1], split_data[[i]]$country[1]), row.names = FALSE)
  # ggplot objects must be printed explicitly inside a loop to be drawn.
  print(list_plots[[i]])
}
dev.off()