提问者:小点点

根据两个条件拆分数据集,将每个数据帧保存为 .csv 文件,再遍历每个文件并绘制图形


我是数据科学的新手,我需要帮助做以下事情:

(一) 根据两列中的唯一分组组合拆分数据集,在我的例子中是地区(Region)和国家(country)

(二) 我想将每个数据帧保存为一个 .csv 文件,文件名类似 regionname_country.csv,例如 west_GER.csv、east_POL.csv

(三) 如果可能的话,我想用循环遍历这些 .csv 文件,为每个 df 绘制教育(Education)与年龄(Age)的散点图。

(四) 最后将我的绘图/图形保存在pdf文件中(每页4个图形)

'df'
   Region, country, Age, Education, Income, FICO, Target
1   west, GER, 43, 1, 47510, 710, 1
2   east, POL, 32, 2, 73640, 723, 1
3   east, POL, 22, 2, 88525, 610, 0
4   west, GER, 55, 0, 31008, 592, 0
5   north, USA, 19, 0, 18007, 599, 1
6   south, PER, 27, 2, 68850, 690, 0
7   south, BRZ, 56, 3, 71065, 592, 0
8   north, USA, 39, 1, 98004, 729, 1
9   east, JPN, 36, 2, 51361, 692, 0
10  west, ESP, 59, 1, 98643, 729, 1

预期结果:

 # df_to_csv : 'west_GER.csv'
west, GER, 43, 1, 47510, 710, 1 
west, GER, 55, 0, 31008, 592, 0 
# west_ESP.csv
west, ESP, 59, 1, 98643, 729, 1 
# east_POL.csv
east, POL, 32, 2, 73640, 723, 1 

.
.
.

# north_USA.csv
north, USA, 39, 1, 98004, 729, 1  
north, USA, 19, 0, 18007, 599, 1

请参阅下面的代码

# using pandas 

# code for (I) and (II) not sure of my code but I think I need to nest through the for loop

# Group on both columns at once: every distinct (Region, country) pair yields
# exactly one sub-frame, so no nested loop is needed. The group key is a tuple
# that is unpacked to build the '<region>_<country>.csv' file name.
for (region, country), split_df in df.groupby(['Region', 'country']):
    split_df.to_csv(f'{region}_{country}.csv', index=False)

# code for (III) and (IV)

import glob
import numpy
import matplotlib.pyplot 
from matplotlib import pyplot as plot
from matplotlib.backends.backend_pdf import PdfPages


# Collect every '<region>_<country>.csv' file; the leading '*' is required,
# otherwise only names that START with '_' would match.
filenames = sorted(glob.glob('*_*.csv'))
nb_plots = len(filenames)
nb_plots_per_page = 4

# Open the PDF once, outside the loop, so all pages end up in one document.
# Leaving the `with` block writes the document to disk.
with PdfPages('plots.pdf') as pdf_pages:
    for i, filename in enumerate(filenames):
        print(filename)

        # Start a new page (a 2x2 grid of axes) every `nb_plots_per_page` files.
        if i % nb_plots_per_page == 0:
            fig, axes = plot.subplots(2, 2, figsize=(8.5, 12), dpi=125)
            axes = axes.flatten()
        slot = i % nb_plots_per_page
        ax = axes[slot]

        # Skip the header row and read only the numeric Age / Education
        # columns (positions 2 and 3 when the files were saved with
        # index=False); ndmin=2 keeps single-row files two-dimensional.
        data = numpy.loadtxt(fname=filename, delimiter=',', skiprows=1,
                             usecols=(2, 3), ndmin=2)

        ax.scatter(data[:, 0], data[:, 1])
        ax.set_title(filename)
        ax.set_xlabel("Age")
        ax.set_ylabel("Education")

        # Flush the page once it is full or the last file has been plotted.
        if (i + 1) % nb_plots_per_page == 0 or (i + 1) == nb_plots:
            # Hide the panels left unused on a partially filled last page.
            for unused_ax in axes[slot + 1:]:
                unused_ax.set_visible(False)
            fig.tight_layout()
            pdf_pages.savefig(fig)
            plot.close(fig)  # release the figure so memory does not pile up

任何帮助都将不胜感激,我对python和R都持开放态度。提前谢谢。


#Attempt for PCA 


import glob
import matplotlib.pyplot as plt

fig, axs = plt.subplots(nrows=2, ncols=2)
colormap = np.array(['r', 'g'])  # red / green for the two label classes
for ax, file in zip(axs.flatten(), glob.glob("./*csv")):
    df_temp = pd.read_csv(file)  # read each csv file
    # Drop the saved index column only if it exists; files written with
    # index=False do not contain it and a plain drop would raise KeyError.
    df_temp = df_temp.drop(columns='Unnamed: 0', errors='ignore')
    df_temp = df_temp.dropna()  # drop NaNs

    X = df_temp.iloc[:, 4:]  # feature columns: the 5th column to the end
    y = df_temp.iloc[:, 0].to_numpy()  # the first column is the label column

    # Standardise the features, then project them onto two principal components.
    X = StandardScaler().fit_transform(X)
    x_pca = PCA(n_components=2).fit_transform(X)

    PC1_temp = x_pca[0, 0]
    PC2_temp = x_pca[0, 1]

    # One scatter call per class so each class gets its own legend entry.
    # NOTE(review): assumes the labels are the integers 0 and 1 - confirm.
    for cls in np.unique(y):
        mask = y == cls
        ax.scatter(x_pca[mask, 0], x_pca[mask, 1],
                   color=colormap[int(cls)], label=f"class {cls}")

    ax.set_title(f"PC1:{PC1_temp}, P2:{PC2_temp}")
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.legend()  # legend on this subplot, not on whichever axes is "current"

# Lay out once after all subplots are drawn, then save.
fig.tight_layout()
fig.savefig("scatter.pdf")

```

共2个答案

匿名用户

对于Python:
(I)

# Iterate the (Region, country) groups directly. The group key already carries
# both values in a fixed order, so there is no need to recover them through
# np.unique (which sorts them alphabetically) or to regroup on every iteration.
for (region, country), group_df in df.groupby(["Region", "country"]):
    group_df.to_csv(f"{region}_{country}.csv")

(III)

import glob
import matplotlib.pyplot as plt

from matplotlib.backends.backend_pdf import PdfPages

files = sorted(glob.glob("./*.csv"))
plots_per_page = 4

# A single 2x2 figure can only show four files; paginate so that every CSV is
# plotted, four scatter plots per PDF page.
with PdfPages("scatter.pdf") as pdf:
    for start in range(0, len(files), plots_per_page):
        page_files = files[start:start + plots_per_page]
        fig, axs = plt.subplots(nrows=2, ncols=2)
        page_axes = axs.flatten()

        for ax, file in zip(page_axes, page_files):
            df_temp = pd.read_csv(file)
            # Every row of one file shares the same Region / country.
            region_temp = df_temp['Region'].iloc[0]
            country_temp = df_temp['country'].iloc[0]
            ax.scatter(df_temp["Age"], df_temp["Education"])
            ax.set_title(f"Region:{region_temp}, Country:{country_temp}")
            ax.set_xlabel("Age")
            ax.set_ylabel("Education")

        # Hide the panels left empty on a partially filled last page.
        for ax in page_axes[len(page_files):]:
            ax.set_visible(False)

        fig.tight_layout()
        pdf.savefig(fig)
        plt.close(fig)

匿名用户

在R中,您可以这样做:

library(tidyverse)

#get data in list of dataframes
df %>%
  select(Region, country, Education, Age) %>%
  group_split(Region, country) -> split_data

#From list of data create list of plots. 
list_plots <- map(split_data, ~ggplot(.) + aes(Education, Age) + 
                geom_point() + 
                 ggtitle(sprintf('Plot for region %s and country %s', 
                 first(.$Region), first(.$country))))

#Write the plots in pdf as well as write the csvs.
pdf("plots.pdf", onefile = TRUE)
for (i in seq_along(list_plots)) {
  # Write only the i-th group's data frame; passing the whole `split_data`
  # list would not produce one file per region/country group.
  write.csv(split_data[[i]], sprintf('%s_%s.csv', 
      split_data[[i]]$Region[1], split_data[[i]]$country[1]), row.names = FALSE)
  print(list_plots[[i]]) 
}
dev.off()
dev.off()