我有以下数据框。
# Example input: a 15-row data.frame with one row per gene.
#   Genecoverage - numeric coverage value for the gene
#   Gene         - character gene identifier (e.g. "k141_32902_11")
#   sample.x     - character sample label ("10" for every row)
#   Phylum..Species - taxonomy columns stored as factors
#   newgene      - character key shared by related genes (e.g. "k141_32902");
#                  this is the key the later snippets group by
# The literal below appears to be dput() output; `.Names` and `.Label` are the
# legacy spellings that structure() maps to `names` and `levels`.
df <- structure(list(Genecoverage = c(19.8511111111111, 10.1516966067864,
14.5631205673759, 7.25225225225225, 10.774011299435, 11.4794520547945,
17.7967032967033, 12.6770616770617, 14.1375, 13.2422422422422,
14.0379403794038, 11.4844006568145, 21.296875, 18.90625, 24.3293253173013
), Gene = c("k141_32902_11", "k141_32902_16", "k141_32902_22",
"k141_32902_23", "k141_32902_27", "k141_32902_28", "k141_32902_29",
"k141_32902_3", "k141_32902_30", "k141_32902_37", "k141_32902_38",
"k141_32902_5", "k141_3238_18", "k141_3238_3", "k141_3238_6"),
sample.x = c("10", "10", "10", "10", "10", "10", "10", "10",
"10", "10", "10", "10", "10", "10", "10"), Phylum = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Firmicutes", class = "factor"),
Class = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L), .Label = c("Bacilli", "Tissierellia"
), class = "factor"), Order = structure(c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L), .Label = c("Bacillales",
"Tissierellales"), class = "factor"), Family = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("Peptoniphilaceae",
"Staphylococcaceae"), class = "factor"), Genus = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("Peptoniphilus",
"Staphylococcus"), class = "factor"), Species = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "unknown", class = "factor"),
newgene = c("k141_32902", "k141_32902", "k141_32902", "k141_32902",
"k141_32902", "k141_32902", "k141_32902", "k141_32902", "k141_32902",
"k141_32902", "k141_32902", "k141_32902", "k141_3238", "k141_3238",
"k141_3238")), .Names = c("Genecoverage", "Gene", "sample.x",
"Phylum", "Class", "Order", "Family", "Genus", "Species", "newgene"
), row.names = c("42481", "42486", "42493", "42494", "42498",
"42499", "42500", "42501", "42502", "42509", "42510", "42512",
"41540", "41546", "41552"), class = "data.frame")
打印后得到如下数据框:
Genecoverage Gene sample.x Phylum Class Order Family Genus Species newgene
42481 19.851111 k141_32902_11 10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902
42486 10.151697 k141_32902_16 10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902
42493 14.563121 k141_32902_22 10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902
42494 7.252252 k141_32902_23 10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902
42498 10.774011 k141_32902_27 10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902
42499 11.479452 k141_32902_28 10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902
42500 17.796703 k141_32902_29 10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902
42501 12.677062 k141_32902_3 10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902
42502 14.137500 k141_32902_30 10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902
42509 13.242242 k141_32902_37 10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902
42510 14.037940 k141_32902_38 10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902
42512 11.484401 k141_32902_5 10 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902
41540 21.296875 k141_3238_18 10 Firmicutes Bacilli Bacillales Staphylococcaceae Staphylococcus unknown k141_3238
41546 18.906250 k141_3238_3 10 Firmicutes Bacilli Bacillales Staphylococcaceae Staphylococcus unknown k141_3238
41552 24.329325 k141_3238_6 10 Firmicutes Bacilli Bacillales Staphylococcaceae Staphylococcus unknown k141_3238
我想按 newgene(以及各分类学列)分组,计算 Genecoverage 的中位数,并新建一列,记录聚合过程中被合并的各个基因(Gene)。
# Take the median of every numeric column within each taxonomy/newgene group.
# Only the grouping keys and the numeric columns survive, so Gene is dropped.
newdata <- df %>%
  group_by(Phylum, Class, Order, Family, Genus, Species, newgene) %>%
  summarise_if(is.numeric, median) %>%
  data.frame()
这将返回以下内容
Phylum Class Order Family Genus Species newgene Genecoverage
1 Firmicutes Bacilli Bacillales Staphylococcaceae Staphylococcus unknown k141_3238 21.29688
2 Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902 12.95965
我的问题是:这样会丢失 Gene 列的信息。理想情况下,我想新建一列,把每组中被合并的所有 Gene 名称拼接在一起。例如,在上面的输出数据框中,newgene 为 k141_32902 的那一行对应于被合并的基因 k141_32902_11、k141_32902_16、k141_32902_22…
期望的最终数据框如下:
Phylum Class Order Family Genus Species newgene
Firmicutes Bacilli Bacillales Staphylococcaceae Staphylococcus unknown k141_3238
Firmicutes Tissierellia Tissierellales Peptoniphilaceae Peptoniphilus unknown k141_32902
Genecoverage Concatenated_genes
21.3 k141_3238_18,k141_3238_3,k141_3238_6
13.0 k141_32902_11,k141_32902_16,k141_32902_22
我们可以将 summarise_all 与自定义函数一起使用:如果列是数值型,该函数计算中位数;否则使用 toString 把该组内的所有字符串连接起来。
library(dplyr)
# Reduce one column of a group to a single value: the median for numeric
# columns, otherwise all values joined into one ", "-separated string.
# A plain function with a scalar if/else replaces the deprecated funs()
# wrapper and the vectorised ifelse(), whose test here is always length one.
collapse_column <- function(x) {
  if (is.numeric(x)) median(x) else toString(x)
}

# Collapse each taxonomy/newgene group to one row and keep the merged gene
# names in Concatenated_genes.
newdata <- df %>%
  select(-sample.x) %>%
  group_by(Phylum, Class, Order, Family, Genus, Species, newgene) %>%
  summarise_all(collapse_column) %>%
  # summarise() only drops the last grouping key; ungroup so later verbs
  # operate on the whole table.
  ungroup() %>%
  rename(Concatenated_genes = Gene)