为啥标签与 pandas、itertools 和 numpy 索引不一致?
Posted
技术标签:
【中文标题】为啥标签与 pandas、itertools 和 numpy 索引不一致?【英文标题】:Why labels are not consistent with pandas, itertools, and numpy indexing?(为什么标签与 pandas、itertools 和 numpy 索引不一致?) 【发布时间】:2019-09-30 18:38:17 【问题描述】:我想不通。为什么在这种情况下 group_A 的顺序很重要?Ar_groupA 和 Ar_groupB 都按正确的索引取出,并以正确的顺序被调用。
我正在使用pandas v0.24.2
请帮助我理解为什么分组中的数据很重要。
from collections import *
import itertools

import numpy as np
import pandas as pd
def pairwise_logfc(df_data, group_A, group_B):
    """Build a table of pairwise log2 differences between two groups of rows.

    The input frame is copied and log2-transformed, the rows labelled by
    ``group_A`` and ``group_B`` are selected with ``.loc``, and for each pair
    the difference ``group_B row - group_A row`` is appended. The result is a
    DataFrame with the original columns and a two-level MultiIndex named
    ``("group_B", "group_A")``.

    NOTE(review): this is the version posted in the question and it shows the
    reported inconsistency. The rows are appended with ``group_A`` in the
    OUTER loop and ``group_B`` in the inner loop, whereas the labels come from
    ``itertools.product(group_B, group_A)``, which varies ``group_A`` fastest.
    Row k and label k therefore generally describe different pairs, so the
    values found under a given label change with the order of ``group_A``.

    ``np`` and ``pd`` are not imported in this snippet; presumably they are
    numpy and pandas -- confirm against the surrounding file.
    """
    # Init
    X = df_data.copy()
    attr_labels = X.columns
    # Log Transform
    df_log = np.log2(X)
    # Groups: plain arrays, rows in the order given by group_A / group_B
    Ar_groupA = df_log.loc[group_A,:].values
    Ar_groupB = df_log.loc[group_B,:].values
    # Pairwise profiles: appended A-major (all of group_B for the first A row,
    # then all of group_B for the second A row, ...)
    logfc_profiles = list()
    for i in range(len(group_A)):
        u = Ar_groupA[i,:]
        for j in range(len(group_B)):
            v = Ar_groupB[j,:]
            logfc_profiles.append(v - u)
    # Labels: generated B-major (group_B outer, group_A inner), i.e. not in
    # the order in which the rows above were appended
    groups = (group_B, group_A)
    labels = [*itertools.product(*groups)]
    return pd.DataFrame(logfc_profiles, index=pd.MultiIndex.from_tuples(labels,names=["group_B", "group_A"]), columns=attr_labels)
# Load data
data = OrderedDict([('sepal_length', OrderedDict([('iris_0', 5.1), ('iris_1', 4.9), ('iris_10', 5.4), ('iris_100', 6.3), ('iris_101', 5.8), ('iris_102', 7.1), ('iris_103', 6.3), ('iris_104', 6.5), ('iris_105', 7.6), ('iris_106', 4.9), ('iris_107', 7.3), ('iris_108', 6.7), ('iris_109', 7.2), ('iris_11', 4.8), ('iris_110', 6.5), ('iris_111', 6.4), ('iris_112', 6.8), ('iris_113', 5.7), ('iris_114', 5.8), ('iris_115', 6.4), ('iris_116', 6.5), ('iris_117', 7.7), ('iris_118', 7.7), ('iris_119', 6.0), ('iris_12', 4.8), ('iris_120', 6.9), ('iris_121', 5.6), ('iris_122', 7.7), ('iris_123', 6.3), ('iris_124', 6.7), ('iris_125', 7.2), ('iris_126', 6.2), ('iris_127', 6.1), ('iris_128', 6.4), ('iris_129', 7.2), ('iris_13', 4.3), ('iris_130', 7.4), ('iris_131', 7.9), ('iris_132', 6.4), ('iris_133', 6.3), ('iris_134', 6.1), ('iris_135', 7.7), ('iris_136', 6.3), ('iris_137', 6.4), ('iris_138', 6.0), ('iris_139', 6.9), ('iris_14', 5.8), ('iris_140', 6.7), ('iris_141', 6.9), ('iris_142', 5.8), ('iris_143', 6.8), ('iris_144', 6.7), ('iris_145', 6.7), ('iris_146', 6.3), ('iris_147', 6.5), ('iris_148', 6.2), ('iris_149', 5.9), ('iris_15', 5.7), ('iris_16', 5.4), ('iris_17', 5.1), ('iris_18', 5.7), ('iris_19', 5.1), ('iris_2', 4.7), ('iris_20', 5.4), ('iris_21', 5.1), ('iris_22', 4.6), ('iris_23', 5.1), ('iris_24', 4.8), ('iris_25', 5.0), ('iris_26', 5.0), ('iris_27', 5.2), ('iris_28', 5.2), ('iris_29', 4.7), ('iris_3', 4.6), ('iris_30', 4.8), ('iris_31', 5.4), ('iris_32', 5.2), ('iris_33', 5.5), ('iris_34', 4.9), ('iris_35', 5.0), ('iris_36', 5.5), ('iris_37', 4.9), ('iris_38', 4.4), ('iris_39', 5.1), ('iris_4', 5.0), ('iris_40', 5.0), ('iris_41', 4.5), ('iris_42', 4.4), ('iris_43', 5.0), ('iris_44', 5.1), ('iris_45', 4.8), ('iris_46', 5.1), ('iris_47', 4.6), ('iris_48', 5.3), ('iris_49', 5.0), ('iris_5', 5.4), ('iris_50', 7.0), ('iris_51', 6.4), ('iris_52', 6.9), ('iris_53', 5.5), ('iris_54', 6.5), ('iris_55', 5.7), ('iris_56', 6.3), ('iris_57', 4.9), ('iris_58', 6.6), ('iris_59', 
5.2), ('iris_6', 4.6), ('iris_60', 5.0), ('iris_61', 5.9), ('iris_62', 6.0), ('iris_63', 6.1), ('iris_64', 5.6), ('iris_65', 6.7), ('iris_66', 5.6), ('iris_67', 5.8), ('iris_68', 6.2), ('iris_69', 5.6), ('iris_7', 5.0), ('iris_70', 5.9), ('iris_71', 6.1), ('iris_72', 6.3), ('iris_73', 6.1), ('iris_74', 6.4), ('iris_75', 6.6), ('iris_76', 6.8), ('iris_77', 6.7), ('iris_78', 6.0), ('iris_79', 5.7), ('iris_8', 4.4), ('iris_80', 5.5), ('iris_81', 5.5), ('iris_82', 5.8), ('iris_83', 6.0), ('iris_84', 5.4), ('iris_85', 6.0), ('iris_86', 6.7), ('iris_87', 6.3), ('iris_88', 5.6), ('iris_89', 5.5), ('iris_9', 4.9), ('iris_90', 5.5), ('iris_91', 6.1), ('iris_92', 5.8), ('iris_93', 5.0), ('iris_94', 5.6), ('iris_95', 5.7), ('iris_96', 5.7), ('iris_97', 6.2), ('iris_98', 5.1), ('iris_99', 5.7)])), ('sepal_width', OrderedDict([('iris_0', 3.5), ('iris_1', 3.0), ('iris_10', 3.7), ('iris_100', 3.3), ('iris_101', 2.7), ('iris_102', 3.0), ('iris_103', 2.9), ('iris_104', 3.0), ('iris_105', 3.0), ('iris_106', 2.5), ('iris_107', 2.9), ('iris_108', 2.5), ('iris_109', 3.6), ('iris_11', 3.4), ('iris_110', 3.2), ('iris_111', 2.7), ('iris_112', 3.0), ('iris_113', 2.5), ('iris_114', 2.8), ('iris_115', 3.2), ('iris_116', 3.0), ('iris_117', 3.8), ('iris_118', 2.6), ('iris_119', 2.2), ('iris_12', 3.0), ('iris_120', 3.2), ('iris_121', 2.8), ('iris_122', 2.8), ('iris_123', 2.7), ('iris_124', 3.3), ('iris_125', 3.2), ('iris_126', 2.8), ('iris_127', 3.0), ('iris_128', 2.8), ('iris_129', 3.0), ('iris_13', 3.0), ('iris_130', 2.8), ('iris_131', 3.8), ('iris_132', 2.8), ('iris_133', 2.8), ('iris_134', 2.6), ('iris_135', 3.0), ('iris_136', 3.4), ('iris_137', 3.1), ('iris_138', 3.0), ('iris_139', 3.1), ('iris_14', 4.0), ('iris_140', 3.1), ('iris_141', 3.1), ('iris_142', 2.7), ('iris_143', 3.2), ('iris_144', 3.3), ('iris_145', 3.0), ('iris_146', 2.5), ('iris_147', 3.0), ('iris_148', 3.4), ('iris_149', 3.0), ('iris_15', 4.4), ('iris_16', 3.9), ('iris_17', 3.5), ('iris_18', 3.8), ('iris_19', 3.8), 
('iris_2', 3.2), ('iris_20', 3.4), ('iris_21', 3.7), ('iris_22', 3.6), ('iris_23', 3.3), ('iris_24', 3.4), ('iris_25', 3.0), ('iris_26', 3.4), ('iris_27', 3.5), ('iris_28', 3.4), ('iris_29', 3.2), ('iris_3', 3.1), ('iris_30', 3.1), ('iris_31', 3.4), ('iris_32', 4.1), ('iris_33', 4.2), ('iris_34', 3.1), ('iris_35', 3.2), ('iris_36', 3.5), ('iris_37', 3.6), ('iris_38', 3.0), ('iris_39', 3.4), ('iris_4', 3.6), ('iris_40', 3.5), ('iris_41', 2.3), ('iris_42', 3.2), ('iris_43', 3.5), ('iris_44', 3.8), ('iris_45', 3.0), ('iris_46', 3.8), ('iris_47', 3.2), ('iris_48', 3.7), ('iris_49', 3.3), ('iris_5', 3.9), ('iris_50', 3.2), ('iris_51', 3.2), ('iris_52', 3.1), ('iris_53', 2.3), ('iris_54', 2.8), ('iris_55', 2.8), ('iris_56', 3.3), ('iris_57', 2.4), ('iris_58', 2.9), ('iris_59', 2.7), ('iris_6', 3.4), ('iris_60', 2.0), ('iris_61', 3.0), ('iris_62', 2.2), ('iris_63', 2.9), ('iris_64', 2.9), ('iris_65', 3.1), ('iris_66', 3.0), ('iris_67', 2.7), ('iris_68', 2.2), ('iris_69', 2.5), ('iris_7', 3.4), ('iris_70', 3.2), ('iris_71', 2.8), ('iris_72', 2.5), ('iris_73', 2.8), ('iris_74', 2.9), ('iris_75', 3.0), ('iris_76', 2.8), ('iris_77', 3.0), ('iris_78', 2.9), ('iris_79', 2.6), ('iris_8', 2.9), ('iris_80', 2.4), ('iris_81', 2.4), ('iris_82', 2.7), ('iris_83', 2.7), ('iris_84', 3.0), ('iris_85', 3.4), ('iris_86', 3.1), ('iris_87', 2.3), ('iris_88', 3.0), ('iris_89', 2.5), ('iris_9', 3.1), ('iris_90', 2.6), ('iris_91', 3.0), ('iris_92', 2.6), ('iris_93', 2.3), ('iris_94', 2.7), ('iris_95', 3.0), ('iris_96', 2.9), ('iris_97', 2.9), ('iris_98', 2.5), ('iris_99', 2.8)])), ('petal_length', OrderedDict([('iris_0', 1.4), ('iris_1', 1.4), ('iris_10', 1.5), ('iris_100', 6.0), ('iris_101', 5.1), ('iris_102', 5.9), ('iris_103', 5.6), ('iris_104', 5.8), ('iris_105', 6.6), ('iris_106', 4.5), ('iris_107', 6.3), ('iris_108', 5.8), ('iris_109', 6.1), ('iris_11', 1.6), ('iris_110', 5.1), ('iris_111', 5.3), ('iris_112', 5.5), ('iris_113', 5.0), ('iris_114', 5.1), ('iris_115', 5.3), ('iris_116', 
5.5), ('iris_117', 6.7), ('iris_118', 6.9), ('iris_119', 5.0), ('iris_12', 1.4), ('iris_120', 5.7), ('iris_121', 4.9), ('iris_122', 6.7), ('iris_123', 4.9), ('iris_124', 5.7), ('iris_125', 6.0), ('iris_126', 4.8), ('iris_127', 4.9), ('iris_128', 5.6), ('iris_129', 5.8), ('iris_13', 1.1), ('iris_130', 6.1), ('iris_131', 6.4), ('iris_132', 5.6), ('iris_133', 5.1), ('iris_134', 5.6), ('iris_135', 6.1), ('iris_136', 5.6), ('iris_137', 5.5), ('iris_138', 4.8), ('iris_139', 5.4), ('iris_14', 1.2), ('iris_140', 5.6), ('iris_141', 5.1), ('iris_142', 5.1), ('iris_143', 5.9), ('iris_144', 5.7), ('iris_145', 5.2), ('iris_146', 5.0), ('iris_147', 5.2), ('iris_148', 5.4), ('iris_149', 5.1), ('iris_15', 1.5), ('iris_16', 1.3), ('iris_17', 1.4), ('iris_18', 1.7), ('iris_19', 1.5), ('iris_2', 1.3), ('iris_20', 1.7), ('iris_21', 1.5), ('iris_22', 1.0), ('iris_23', 1.7), ('iris_24', 1.9), ('iris_25', 1.6), ('iris_26', 1.6), ('iris_27', 1.5), ('iris_28', 1.4), ('iris_29', 1.6), ('iris_3', 1.5), ('iris_30', 1.6), ('iris_31', 1.5), ('iris_32', 1.5), ('iris_33', 1.4), ('iris_34', 1.5), ('iris_35', 1.2), ('iris_36', 1.3), ('iris_37', 1.4), ('iris_38', 1.3), ('iris_39', 1.5), ('iris_4', 1.4), ('iris_40', 1.3), ('iris_41', 1.3), ('iris_42', 1.3), ('iris_43', 1.6), ('iris_44', 1.9), ('iris_45', 1.4), ('iris_46', 1.6), ('iris_47', 1.4), ('iris_48', 1.5), ('iris_49', 1.4), ('iris_5', 1.7), ('iris_50', 4.7), ('iris_51', 4.5), ('iris_52', 4.9), ('iris_53', 4.0), ('iris_54', 4.6), ('iris_55', 4.5), ('iris_56', 4.7), ('iris_57', 3.3), ('iris_58', 4.6), ('iris_59', 3.9), ('iris_6', 1.4), ('iris_60', 3.5), ('iris_61', 4.2), ('iris_62', 4.0), ('iris_63', 4.7), ('iris_64', 3.6), ('iris_65', 4.4), ('iris_66', 4.5), ('iris_67', 4.1), ('iris_68', 4.5), ('iris_69', 3.9), ('iris_7', 1.5), ('iris_70', 4.8), ('iris_71', 4.0), ('iris_72', 4.9), ('iris_73', 4.7), ('iris_74', 4.3), ('iris_75', 4.4), ('iris_76', 4.8), ('iris_77', 5.0), ('iris_78', 4.5), ('iris_79', 3.5), ('iris_8', 1.4), ('iris_80', 3.8), 
('iris_81', 3.7), ('iris_82', 3.9), ('iris_83', 5.1), ('iris_84', 4.5), ('iris_85', 4.5), ('iris_86', 4.7), ('iris_87', 4.4), ('iris_88', 4.1), ('iris_89', 4.0), ('iris_9', 1.5), ('iris_90', 4.4), ('iris_91', 4.6), ('iris_92', 4.0), ('iris_93', 3.3), ('iris_94', 4.2), ('iris_95', 4.2), ('iris_96', 4.2), ('iris_97', 4.3), ('iris_98', 3.0), ('iris_99', 4.1)])), ('petal_width', OrderedDict([('iris_0', 0.2), ('iris_1', 0.2), ('iris_10', 0.2), ('iris_100', 2.5), ('iris_101', 1.9), ('iris_102', 2.1), ('iris_103', 1.8), ('iris_104', 2.2), ('iris_105', 2.1), ('iris_106', 1.7), ('iris_107', 1.8), ('iris_108', 1.8), ('iris_109', 2.5), ('iris_11', 0.2), ('iris_110', 2.0), ('iris_111', 1.9), ('iris_112', 2.1), ('iris_113', 2.0), ('iris_114', 2.4), ('iris_115', 2.3), ('iris_116', 1.8), ('iris_117', 2.2), ('iris_118', 2.3), ('iris_119', 1.5), ('iris_12', 0.1), ('iris_120', 2.3), ('iris_121', 2.0), ('iris_122', 2.0), ('iris_123', 1.8), ('iris_124', 2.1), ('iris_125', 1.8), ('iris_126', 1.8), ('iris_127', 1.8), ('iris_128', 2.1), ('iris_129', 1.6), ('iris_13', 0.1), ('iris_130', 1.9), ('iris_131', 2.0), ('iris_132', 2.2), ('iris_133', 1.5), ('iris_134', 1.4), ('iris_135', 2.3), ('iris_136', 2.4), ('iris_137', 1.8), ('iris_138', 1.8), ('iris_139', 2.1), ('iris_14', 0.2), ('iris_140', 2.4), ('iris_141', 2.3), ('iris_142', 1.9), ('iris_143', 2.3), ('iris_144', 2.5), ('iris_145', 2.3), ('iris_146', 1.9), ('iris_147', 2.0), ('iris_148', 2.3), ('iris_149', 1.8), ('iris_15', 0.4), ('iris_16', 0.4), ('iris_17', 0.3), ('iris_18', 0.3), ('iris_19', 0.3), ('iris_2', 0.2), ('iris_20', 0.2), ('iris_21', 0.4), ('iris_22', 0.2), ('iris_23', 0.5), ('iris_24', 0.2), ('iris_25', 0.2), ('iris_26', 0.4), ('iris_27', 0.2), ('iris_28', 0.2), ('iris_29', 0.2), ('iris_3', 0.2), ('iris_30', 0.2), ('iris_31', 0.4), ('iris_32', 0.1), ('iris_33', 0.2), ('iris_34', 0.2), ('iris_35', 0.2), ('iris_36', 0.2), ('iris_37', 0.1), ('iris_38', 0.2), ('iris_39', 0.2), ('iris_4', 0.2), ('iris_40', 0.3), ('iris_41', 
0.3), ('iris_42', 0.2), ('iris_43', 0.6), ('iris_44', 0.4), ('iris_45', 0.3), ('iris_46', 0.2), ('iris_47', 0.2), ('iris_48', 0.2), ('iris_49', 0.2), ('iris_5', 0.4), ('iris_50', 1.4), ('iris_51', 1.5), ('iris_52', 1.5), ('iris_53', 1.3), ('iris_54', 1.5), ('iris_55', 1.3), ('iris_56', 1.6), ('iris_57', 1.0), ('iris_58', 1.3), ('iris_59', 1.4), ('iris_6', 0.3), ('iris_60', 1.0), ('iris_61', 1.5), ('iris_62', 1.0), ('iris_63', 1.4), ('iris_64', 1.3), ('iris_65', 1.4), ('iris_66', 1.5), ('iris_67', 1.0), ('iris_68', 1.5), ('iris_69', 1.1), ('iris_7', 0.2), ('iris_70', 1.8), ('iris_71', 1.3), ('iris_72', 1.5), ('iris_73', 1.2), ('iris_74', 1.3), ('iris_75', 1.4), ('iris_76', 1.4), ('iris_77', 1.7), ('iris_78', 1.5), ('iris_79', 1.0), ('iris_8', 0.2), ('iris_80', 1.1), ('iris_81', 1.0), ('iris_82', 1.2), ('iris_83', 1.6), ('iris_84', 1.5), ('iris_85', 1.6), ('iris_86', 1.5), ('iris_87', 1.3), ('iris_88', 1.3), ('iris_89', 1.3), ('iris_9', 0.1), ('iris_90', 1.2), ('iris_91', 1.4), ('iris_92', 1.2), ('iris_93', 1.0), ('iris_94', 1.3), ('iris_95', 1.2), ('iris_96', 1.3), ('iris_97', 1.3), ('iris_98', 1.1), ('iris_99', 1.3)]))])
#
# Build the example frame from the nested mapping above
X_iris = pd.DataFrame(data)
# Group A: the first five index labels; group B: the last ten index labels
idx_groupA = X_iris.index[:5]
idx_groupB = X_iris.index[-10:]
# Look up one labelled pair with group_A in sorted order
pairwise_logfc(X_iris, sorted(idx_groupA), idx_groupB).loc[("iris_90", "iris_0")]
# Output reported in the question:
# sepal_length 0.108934
# sepal_width -0.428843
# petal_length 1.652077
# petal_width 2.584963
# Name: (iris_90, iris_0), dtype: float64
# Same label, but with group_A reversed. The values reported below differ
# from the ones above; this is the inconsistency the question is about.
pairwise_logfc(X_iris, sorted(idx_groupA)[::-1], idx_groupB).loc[("iris_90", "iris_0")]
# Output reported in the question:
# sepal_length -0.050626
# sepal_width 0.000000
# petal_length -0.280108
# petal_width -0.547488
# Name: (iris_90, iris_0), dtype: float64
【问题讨论】:
对不起,我还是有点糊涂。当我执行 df_log.loc[group_A,:] 时,结果的每个索引都应该与 group_A 一致,因此 Ar_groupA[i,:] 应该对应 group_A 的第 i 个元素,对吗?
我原本以为它们会一致,因为有这一行:labels = [*itertools.product(*groups)]。我以为 group_A 和 group_B 会被正确地组合。
我觉得我可能没有表达清楚,或者漏掉了某个关键的逻辑。df_log 是一个 pd.DataFrame,并且保留了 X 的原始索引。df_log.loc[group_A,:].values 的顺序应该与 group_A 一致。df_log.loc[group_A,:] 的第 i 个元素应该与 Ar_groupA 的第 i 个元素相同,两者对应的标签都应为 group_A[i]。由于 itertools.product 不会排序,只依赖于两个输入数组的顺序(如预期的那样),因此无论我在把 idx_groupA 传给 group_A 之前是否对它排序,结果都应该相同。对吗?谢谢。
【参考方案1】:
更新:完全去掉“Pairwise profiles”(成对差值)部分的循环。我们可以利用 numpy 的广播机制,用一行代码完成减法 :D。
import numpy as np
def pairwise_logfc(df_data, group_A, group_B):
    """Return log2 differences for every (group_B row, group_A row) pair.

    Every value of ``df_data`` is log2-transformed. Then, for each label ``b``
    selected by ``group_B`` and each label ``a`` selected by ``group_A``, the
    row ``log2(df_data.loc[b]) - log2(df_data.loc[a])`` is produced.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Table whose index contains the labels of both groups. It is not
        modified.
    group_A : list-like of index labels
        Rows that are subtracted (the reference group).
    group_B : list-like of index labels
        Rows that are subtracted from.

    Returns
    -------
    pandas.DataFrame
        One row per pair, with the columns of ``df_data`` and a two-level
        MultiIndex named ``("group_B", "group_A")``. Rows are ordered with
        ``group_B`` as the outer level and ``group_A`` varying fastest, so
        each label always refers to its own pair regardless of the order in
        which the groups were passed. An empty group yields an empty frame.

    Raises
    ------
    KeyError
        Propagated from ``DataFrame.loc`` when a label is not in the index.
    """
    # np.log2 returns a new DataFrame, so the caller's data is left untouched
    # without an explicit copy.
    df_log = np.log2(df_data)

    # Keep the selected frames (not just their values) so that the labels
    # below are taken from exactly the rows that were selected.
    rows_a = df_log.loc[group_A, :]
    rows_b = df_log.loc[group_B, :]

    # Broadcasting: (n_B, 1, n_cols) - (1, n_A, n_cols) -> (n_B, n_A, n_cols)
    diffs = rows_b.values[:, None, :] - rows_a.values[None, :, :]

    # A C-order reshape flattens the first two axes B-major / A-minor, the same
    # order MultiIndex.from_product generates below. The explicit target shape
    # also works for empty groups, where np.vstack would raise.
    logfc_profiles = diffs.reshape(len(rows_b) * len(rows_a), df_log.shape[1])

    pair_index = pd.MultiIndex.from_product(
        [rows_b.index, rows_a.index], names=["group_B", "group_A"]
    )
    return pd.DataFrame(logfc_profiles, index=pair_index, columns=df_log.columns)
pairwise_logfc(X_iris, sorted(idx_groupA), idx_groupB).loc[("iris_90", "iris_0")]
#sepal_length 0.108934
#sepal_width -0.428843
#petal_length 1.652077
#petal_width 2.584963
#Name: (iris_90, iris_0), dtype: float64
pairwise_logfc(X_iris, sorted(idx_groupA)[::-1], idx_groupB).loc[("iris_90", "iris_0")]
#sepal_length 0.108934
#sepal_width -0.428843
#petal_length 1.652077
#petal_width 2.584963
#Name: (iris_90, iris_0), dtype: float64
原始修复:
您需要更改“Pairwise profiles”部分中循环的嵌套顺序。在您的原始代码中,外层循环遍历的是 A,但定义 groups 时用的是 itertools.product(group_B, group_A),所以外层循环需要遍历 B。
# Pairwise profiles
# group_B is iterated in the outer loop and group_A in the inner loop, so rows
# are appended in the same order in which itertools.product(group_B, group_A)
# generates the labels.
logfc_profiles = list()
for i in range(len(group_B)):
    u = Ar_groupB[i,:]
    for j in range(len(group_A)):
        v = Ar_groupA[j,:]
        # group_B row minus group_A row
        logfc_profiles.append(u - v)
【讨论】:
@O.rka 请查看更新,里面有一种更巧妙的减法写法。
哇,在我的数据集上,改用 np.newaxis 之后实际耗时(wall time)从 14.5 秒下降到了 4.96 秒。你能解释一下它是如何做到与我们之前的 for 循环等价的吗?
@O.rka,np.newaxis 只是 None 的一个别名,作用是为数组添加一个新轴。因此,如果数组原来的形状是 (10,4),它就会变成 (10,1,4)。这样在做简单的减法 (10,1,4) - (5,4) 时就可以利用广播(broadcasting)得到形状为 (10,5,4) 的结果,也就是用一次向量化的减法代替循环。相关的问答(原文链接“This question”)应该能提供更多信息。随后 vstack 把 (10,5,4) 堆叠成二维的 (50,4),这样我们就可以用它来构造 DataFrame。
以上是关于为啥标签与 pandas、itertools 和 numpy 索引不一致?的主要内容,如果未能解决你的问题,请参考以下文章
为啥 itertools.groupby 可以将 NaN 分组在列表中而不是 numpy 数组中
为啥 itertools groupby 不能按预期工作? [复制]
为啥 itertools.groupby() 不起作用? [复制]
在 Python 中,为啥 itertools.cycle 需要额外的内存? [复制]