基于两个条件拆分数据集，将每个数据帧保存为 .csv 文件，遍历每个文件并绘制图形

提问者：小点点

基于两个条件拆分数据集，将每个数据帧保存为 .csv 文件，遍历每个文件并绘制图形

我是数据科学的新手，我需要帮助做以下事情：

（一）根据两列的唯一组合拆分数据集，在我的例子中这两列是 Region（地区）和 country（国家）

（二）我想将每个数据帧保存为一个 .csv 文件，文件名类似 regionname_country.csv，例如 west_GER.csv、east_POL.csv

（三）如果可能的话，我想用 for 循环遍历这些 .csv 文件，为每个 df 绘制 Education（教育）与 Age（年龄）的散点图。

（四）最后将我的绘图/图形保存在pdf文件中（每页4个图形）

'df'
   Region, country, Age, Education, Income, FICO, Target
1   west, GER, 43, 1, 47510, 710, 1
2   east, POL, 32, 2, 73640, 723, 1
3   east, POL, 22, 2, 88525, 610, 0
4   west, GER, 55, 0, 31008, 592, 0
5   north, USA, 19, 0, 18007, 599, 1
6   south, PER, 27, 2, 68850, 690, 0
7   south, BRZ, 56, 3, 71065, 592, 0
8   north, USA, 39, 1, 98004, 729, 1
9   east, JPN, 36, 2, 51361, 692, 0
10  west, ESP, 59, 1, 98643, 729, 1

预期结果：

 # df_to_csv : 'west_GER.csv'
west, GER, 43, 1, 47510, 710, 1 
west, GER, 55, 0, 31008, 592, 0

# west_ESP.csv
west, ESP, 59, 1, 98643, 729, 1

# east_POL.csv
east, POL, 32, 2, 73640, 723, 1

.
.
.

# north_USA.csv
north, USA, 39, 1, 98004, 729, 1  
north, USA, 19, 0, 18007, 599, 1

请参阅下面的代码

# using pandas 

# Code for (I) and (II): group by BOTH columns at once. Iterating a groupby on
# two keys yields ((region, country), sub-frame) pairs, so no nested loop is
# needed and every sub-frame holds exactly one region/country combination.
for (region, country), split_df in df.groupby(['Region', 'country']):
    # Name each file "<region>_<country>.csv", e.g. "west_GER.csv".
    # Formatting the tuple directly ('{i,j}') would put its repr in the name.
    split_df.to_csv(f'{region}_{country}.csv', index=False)

# code for (III) and (IV)

import glob
import numpy
import matplotlib.pyplot 
from matplotlib import pyplot as plot
from matplotlib.backends.backend_pdf import PdfPages


# The CSV files have a header row and text columns (Region, country), which
# numpy.loadtxt cannot parse; pandas reads them directly.
import pandas as pd

# Page layout: a 2x2 grid gives the requested four plots per page.
n_rows, n_cols = 2, 2
nb_plots_per_page = n_rows * n_cols

# Match "<region>_<country>.csv". The pattern '_*.csv' would only match file
# names that START with an underscore.
filenames = sorted(glob.glob('*_*.csv'))

# Open the PDF once for the whole run; re-creating it for every file would
# overwrite the previous pages. The context manager closes it on exit.
with PdfPages('plots.pdf') as pdf_pages:
    for page_start in range(0, len(filenames), nb_plots_per_page):
        page_files = filenames[page_start:page_start + nb_plots_per_page]

        # One figure per PDF page.
        fig, axes = plot.subplots(n_rows, n_cols, figsize=(8.5, 12), dpi=125)
        axes = axes.flatten()

        for ax, filename in zip(axes, page_files):
            print(filename)
            data = pd.read_csv(filename)
            # Scatter plot of education against age for this region/country.
            ax.scatter(data['Age'], data['Education'])
            ax.set_title(filename)
            ax.set_xlabel('Age')
            ax.set_ylabel('Education')

        # Hide the unused axes on a partially filled last page.
        for ax in axes[len(page_files):]:
            ax.set_visible(False)

        fig.tight_layout()
        pdf_pages.savefig(fig)
        # Free the figure; nothing is shown interactively.
        plot.close(fig)

任何帮助都将不胜感激，我对python和R都持开放态度。提前谢谢。


#Attempt for PCA 


import glob
import matplotlib.pyplot as plt

# Draw a 2-component PCA scatter plot for each CSV file on a 2x2 grid and save
# the grid to "scatter.pdf".
# NOTE(review): pd, StandardScaler and PCA are used here but not imported in
# this snippet; they must already be in scope (pandas / scikit-learn).
fig, axs = plt.subplots(nrows=2, ncols=2)
for ax, file in zip(axs.flatten(), glob.glob("./*csv")):
    df_temp = pd.read_csv(file)  # read each csv file
    # Drop the saved index column if present; do not fail when the CSV was
    # written with index=False and has no such column.
    df_temp = df_temp.drop(columns='Unnamed: 0', errors='ignore')
    df_temp = df_temp.dropna()  # drop NaNs

    X = df_temp.iloc[:, 4:]  # features: the 5th column to the end
    y = df_temp.iloc[:, 0]   # the first column is the label column

    # Standardise the features, then project onto two principal components.
    X = StandardScaler().fit_transform(X)
    x_pca = PCA(n_components=2).fit_transform(X)

    # Keep the projection in a DataFrame for easier plotting.
    data = pd.DataFrame({'PC1': x_pca[:, 0], 'PC2': x_pca[:, 1]})
    PC1_temp = data['PC1'][0]
    PC2_temp = data['PC2'][0]

    # Plot each class separately so every class gets its own legend entry.
    # Assumes the label column holds the integer classes 0 and 1 — TODO confirm.
    categories = y.to_numpy()
    for label, colour in ((0, 'r'), (1, 'g')):
        mask = categories == label
        ax.scatter(x_pca[mask, 0], x_pca[mask, 1], c=colour, label=f"class {label}")

    ax.set_title(f"PC1:{PC1_temp}, PC2:{PC2_temp}")
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.legend()  # red/green legend for the two label classes

# Lay out once, after all subplots are drawn.
fig.tight_layout()
fig.savefig("scatter.pdf")

```




             共2个答案


                        

                
                    匿名用户

                




                
					
对于Python：
（I）
# Iterate the two-key groupby directly: each step yields the (region, country)
# key and the matching sub-frame. This avoids relying on np.unique's sort order
# to tell region from country, and avoids rebuilding the groupby per group.
for (region, country), group in df.groupby(["Region", "country"]):
    # index=False keeps the row index out of the file, so reading it back does
    # not produce an extra "Unnamed: 0" column.
    group.to_csv(f"{region}_{country}.csv", index=False)
(III)
import glob
import matplotlib.pyplot as plt

# Write every CSV's Age/Education scatter plot to "scatter.pdf", four plots
# (a 2x2 grid) per page. A single 2x2 figure zipped against all files would
# silently drop every file after the fourth.
from matplotlib.backends.backend_pdf import PdfPages

PLOTS_PER_PAGE = 4
csv_files = sorted(glob.glob("./*csv"))  # sorted for a stable page order

with PdfPages("scatter.pdf") as pdf:
    for start in range(0, len(csv_files), PLOTS_PER_PAGE):
        page_files = csv_files[start:start + PLOTS_PER_PAGE]
        fig, axs = plt.subplots(nrows=2, ncols=2)
        axes = axs.flatten()

        for ax, file in zip(axes, page_files):
            df_temp = pd.read_csv(file)
            # Positional access: works whatever index the file was saved with.
            region_temp = df_temp['Region'].iloc[0]
            country_temp = df_temp['country'].iloc[0]
            ax.scatter(df_temp["Age"], df_temp["Education"])
            ax.set_title(f"Region:{region_temp}, Country:{country_temp}")
            ax.set_xlabel("Age")
            ax.set_ylabel("Education")

        # Hide the unused axes on a partially filled last page.
        for ax in axes[len(page_files):]:
            ax.set_visible(False)

        # Lay out once per page, after all subplots are drawn.
        fig.tight_layout()
        pdf.savefig(fig)
        plt.close(fig)
				

                
                
            

            
                        

                
                    匿名用户

                




                
					
在R中，您可以这样做：
library(tidyverse)

#get data in list of dataframes
# Split df into a list of data frames, one per (Region, country) combination.
# All columns are kept so that the CSV files written below contain full rows.
df %>%
  group_split(Region, country) -> split_data

# From the list of data frames create a list of plots, one per group.
list_plots <- map(split_data, ~ggplot(.x, aes(Education, Age)) +
                 geom_point() +
                 ggtitle(sprintf('Plot for region %s and country %s',
                 first(.x$Region), first(.x$country))))

# Write the plots to a single pdf and write one csv per group.
pdf("plots.pdf", onefile = TRUE)
for (i in seq_along(list_plots)) {
  # Write only the i-th group (split_data[[i]]), not the whole list.
  write.csv(split_data[[i]], sprintf('%s_%s.csv',
      split_data[[i]]$Region[1], split_data[[i]]$country[1]), row.names = FALSE)
  print(list_plots[[i]])
}
dev.off()


		      
                相关问题
                

																                
					
										   Android：在模块jefied-play-services-测量和jefied-play-services-测量-impl中发现重复类
										   在Hashmap中查找匹配的键/值对
										   如何迭代Hashmap并与同一Hashmap中的其他键进行组合以比较它们的对象
										   HashCode-如果相等的对象碰巧在同一个桶中散列会发生什么？
										   如何防止对数组中类对象的重复引用？
										   JavaHashMap内部数据结构在重新散列期间如何变化？
										   如何以及何时在HashMap中完成重新散列
										   在hashmap或hashtable中重新散列的成本
										   HashMap如何识别内部数组中的哪些位置包含元素？
										   当HashMap增加其大小时，HashMap中值的索引会发生什么？
										   @BeforeClass在ktor测试类中不工作
										   Jest vanilla JavaScript JSDOM刷新失败，切换beforeAll到before每一个后的第二次测试中断
										   在笑话中，定义全局变量是否与在BeforeAll中定义相同？
										   静态编程语言中@BeforeAll的正确解决方法是什么
										   线程“main”java. lang.NoClassDefFoundError中的异常：在Intellij[Spring boot]中
										   线程“main”java. lang.NoClassDefFoundError中的异常：org/apache/log4j/ProvisionNode
										   异步管道是否从服务中定义并从组件变量指向的可观察对象取消订阅？
										   结合主体时不更新在模板中的异步管道可观察
										   组件中的Angular 2重复订阅
										   共享可观察数据的正确方式和地点是什么

基于两个条件拆分数据集，将每个数据帧保存为 .csv 文件，遍历每个文件并绘制图形

共2个答案

相关问题

热门标签

微信关注