为啥标签与 pandas、itertools 和 numpy 索引不一致?

Posted

技术标签:

【中文标题】为啥标签与 pandas、itertools 和 numpy 索引不一致?【英文标题】:Why labels are not consistent with pandas, itertools, and numpy indexing?为什么标签与 pandas、itertools 和 numpy 索引不一致? 【发布时间】:2019-09-30 18:38:17 【问题描述】:

我想不通。为什么在这种情况下 group_A 的顺序很重要? Ar_groupA 和 Ar_groupB 都已正确索引,并按正确的顺序被调用。

我正在使用pandas v0.24.2

请帮助我理解为什么分组中数据的顺序会影响结果。

from collections import *
import itertools

def pairwise_logfc(df_data, group_A, group_B):
    """Build a frame of pairwise log2 differences between two groups of rows.

    The input is log2-transformed, the rows named by ``group_A`` and
    ``group_B`` are selected by label, and for every pair the difference
    ``log2(B row) - log2(A row)`` is collected.

    Relies on ``np`` (numpy) and ``pd`` (pandas) being available in the
    enclosing scope; neither is imported in the visible snippet.

    NOTE(review): the rows are appended in A-major order (outer loop over
    group_A), but the labels come from ``product(group_B, group_A)``, which is
    B-major. Unless one of the groups has a single element, the labels
    therefore do not describe the rows they are attached to. This is the
    behaviour the surrounding question is asking about; it is left unchanged
    here on purpose.
    """
    # Init: work on a copy and remember the column labels for the result
    X = df_data.copy()
    attr_labels = X.columns

    # Log Transform
    df_log = np.log2(X)

    # Groups: label-based row selection, converted to plain arrays
    Ar_groupA = df_log.loc[group_A,:].values
    Ar_groupB = df_log.loc[group_B,:].values

    # Pairwise profiles: outer loop is A, inner loop is B -> row order is
    # (A0,B0), (A0,B1), ..., (A1,B0), ...
    logfc_profiles = list()
    for i in  range(len(group_A)):
        u = Ar_groupA[i,:]
        for j in range(len(group_B)):
            v = Ar_groupB[j,:]
            logfc_profiles.append(v - u)

    # Labels: product(B, A) varies A fastest -> (B0,A0), (B0,A1), ..., (B1,A0), ...
    # which is the opposite nesting to the loop above.
    groups = (group_B, group_A)
    labels = [*itertools.product(*groups)]
    return pd.DataFrame(logfc_profiles, index=pd.MultiIndex.from_tuples(labels,names=["group_B", "group_A"]), columns=attr_labels)

# Load data
data = OrderedDict([('sepal_length', OrderedDict([('iris_0', 5.1), ('iris_1', 4.9), ('iris_10', 5.4), ('iris_100', 6.3), ('iris_101', 5.8), ('iris_102', 7.1), ('iris_103', 6.3), ('iris_104', 6.5), ('iris_105', 7.6), ('iris_106', 4.9), ('iris_107', 7.3), ('iris_108', 6.7), ('iris_109', 7.2), ('iris_11', 4.8), ('iris_110', 6.5), ('iris_111', 6.4), ('iris_112', 6.8), ('iris_113', 5.7), ('iris_114', 5.8), ('iris_115', 6.4), ('iris_116', 6.5), ('iris_117', 7.7), ('iris_118', 7.7), ('iris_119', 6.0), ('iris_12', 4.8), ('iris_120', 6.9), ('iris_121', 5.6), ('iris_122', 7.7), ('iris_123', 6.3), ('iris_124', 6.7), ('iris_125', 7.2), ('iris_126', 6.2), ('iris_127', 6.1), ('iris_128', 6.4), ('iris_129', 7.2), ('iris_13', 4.3), ('iris_130', 7.4), ('iris_131', 7.9), ('iris_132', 6.4), ('iris_133', 6.3), ('iris_134', 6.1), ('iris_135', 7.7), ('iris_136', 6.3), ('iris_137', 6.4), ('iris_138', 6.0), ('iris_139', 6.9), ('iris_14', 5.8), ('iris_140', 6.7), ('iris_141', 6.9), ('iris_142', 5.8), ('iris_143', 6.8), ('iris_144', 6.7), ('iris_145', 6.7), ('iris_146', 6.3), ('iris_147', 6.5), ('iris_148', 6.2), ('iris_149', 5.9), ('iris_15', 5.7), ('iris_16', 5.4), ('iris_17', 5.1), ('iris_18', 5.7), ('iris_19', 5.1), ('iris_2', 4.7), ('iris_20', 5.4), ('iris_21', 5.1), ('iris_22', 4.6), ('iris_23', 5.1), ('iris_24', 4.8), ('iris_25', 5.0), ('iris_26', 5.0), ('iris_27', 5.2), ('iris_28', 5.2), ('iris_29', 4.7), ('iris_3', 4.6), ('iris_30', 4.8), ('iris_31', 5.4), ('iris_32', 5.2), ('iris_33', 5.5), ('iris_34', 4.9), ('iris_35', 5.0), ('iris_36', 5.5), ('iris_37', 4.9), ('iris_38', 4.4), ('iris_39', 5.1), ('iris_4', 5.0), ('iris_40', 5.0), ('iris_41', 4.5), ('iris_42', 4.4), ('iris_43', 5.0), ('iris_44', 5.1), ('iris_45', 4.8), ('iris_46', 5.1), ('iris_47', 4.6), ('iris_48', 5.3), ('iris_49', 5.0), ('iris_5', 5.4), ('iris_50', 7.0), ('iris_51', 6.4), ('iris_52', 6.9), ('iris_53', 5.5), ('iris_54', 6.5), ('iris_55', 5.7), ('iris_56', 6.3), ('iris_57', 4.9), ('iris_58', 6.6), ('iris_59', 
5.2), ('iris_6', 4.6), ('iris_60', 5.0), ('iris_61', 5.9), ('iris_62', 6.0), ('iris_63', 6.1), ('iris_64', 5.6), ('iris_65', 6.7), ('iris_66', 5.6), ('iris_67', 5.8), ('iris_68', 6.2), ('iris_69', 5.6), ('iris_7', 5.0), ('iris_70', 5.9), ('iris_71', 6.1), ('iris_72', 6.3), ('iris_73', 6.1), ('iris_74', 6.4), ('iris_75', 6.6), ('iris_76', 6.8), ('iris_77', 6.7), ('iris_78', 6.0), ('iris_79', 5.7), ('iris_8', 4.4), ('iris_80', 5.5), ('iris_81', 5.5), ('iris_82', 5.8), ('iris_83', 6.0), ('iris_84', 5.4), ('iris_85', 6.0), ('iris_86', 6.7), ('iris_87', 6.3), ('iris_88', 5.6), ('iris_89', 5.5), ('iris_9', 4.9), ('iris_90', 5.5), ('iris_91', 6.1), ('iris_92', 5.8), ('iris_93', 5.0), ('iris_94', 5.6), ('iris_95', 5.7), ('iris_96', 5.7), ('iris_97', 6.2), ('iris_98', 5.1), ('iris_99', 5.7)])), ('sepal_width', OrderedDict([('iris_0', 3.5), ('iris_1', 3.0), ('iris_10', 3.7), ('iris_100', 3.3), ('iris_101', 2.7), ('iris_102', 3.0), ('iris_103', 2.9), ('iris_104', 3.0), ('iris_105', 3.0), ('iris_106', 2.5), ('iris_107', 2.9), ('iris_108', 2.5), ('iris_109', 3.6), ('iris_11', 3.4), ('iris_110', 3.2), ('iris_111', 2.7), ('iris_112', 3.0), ('iris_113', 2.5), ('iris_114', 2.8), ('iris_115', 3.2), ('iris_116', 3.0), ('iris_117', 3.8), ('iris_118', 2.6), ('iris_119', 2.2), ('iris_12', 3.0), ('iris_120', 3.2), ('iris_121', 2.8), ('iris_122', 2.8), ('iris_123', 2.7), ('iris_124', 3.3), ('iris_125', 3.2), ('iris_126', 2.8), ('iris_127', 3.0), ('iris_128', 2.8), ('iris_129', 3.0), ('iris_13', 3.0), ('iris_130', 2.8), ('iris_131', 3.8), ('iris_132', 2.8), ('iris_133', 2.8), ('iris_134', 2.6), ('iris_135', 3.0), ('iris_136', 3.4), ('iris_137', 3.1), ('iris_138', 3.0), ('iris_139', 3.1), ('iris_14', 4.0), ('iris_140', 3.1), ('iris_141', 3.1), ('iris_142', 2.7), ('iris_143', 3.2), ('iris_144', 3.3), ('iris_145', 3.0), ('iris_146', 2.5), ('iris_147', 3.0), ('iris_148', 3.4), ('iris_149', 3.0), ('iris_15', 4.4), ('iris_16', 3.9), ('iris_17', 3.5), ('iris_18', 3.8), ('iris_19', 3.8), 
('iris_2', 3.2), ('iris_20', 3.4), ('iris_21', 3.7), ('iris_22', 3.6), ('iris_23', 3.3), ('iris_24', 3.4), ('iris_25', 3.0), ('iris_26', 3.4), ('iris_27', 3.5), ('iris_28', 3.4), ('iris_29', 3.2), ('iris_3', 3.1), ('iris_30', 3.1), ('iris_31', 3.4), ('iris_32', 4.1), ('iris_33', 4.2), ('iris_34', 3.1), ('iris_35', 3.2), ('iris_36', 3.5), ('iris_37', 3.6), ('iris_38', 3.0), ('iris_39', 3.4), ('iris_4', 3.6), ('iris_40', 3.5), ('iris_41', 2.3), ('iris_42', 3.2), ('iris_43', 3.5), ('iris_44', 3.8), ('iris_45', 3.0), ('iris_46', 3.8), ('iris_47', 3.2), ('iris_48', 3.7), ('iris_49', 3.3), ('iris_5', 3.9), ('iris_50', 3.2), ('iris_51', 3.2), ('iris_52', 3.1), ('iris_53', 2.3), ('iris_54', 2.8), ('iris_55', 2.8), ('iris_56', 3.3), ('iris_57', 2.4), ('iris_58', 2.9), ('iris_59', 2.7), ('iris_6', 3.4), ('iris_60', 2.0), ('iris_61', 3.0), ('iris_62', 2.2), ('iris_63', 2.9), ('iris_64', 2.9), ('iris_65', 3.1), ('iris_66', 3.0), ('iris_67', 2.7), ('iris_68', 2.2), ('iris_69', 2.5), ('iris_7', 3.4), ('iris_70', 3.2), ('iris_71', 2.8), ('iris_72', 2.5), ('iris_73', 2.8), ('iris_74', 2.9), ('iris_75', 3.0), ('iris_76', 2.8), ('iris_77', 3.0), ('iris_78', 2.9), ('iris_79', 2.6), ('iris_8', 2.9), ('iris_80', 2.4), ('iris_81', 2.4), ('iris_82', 2.7), ('iris_83', 2.7), ('iris_84', 3.0), ('iris_85', 3.4), ('iris_86', 3.1), ('iris_87', 2.3), ('iris_88', 3.0), ('iris_89', 2.5), ('iris_9', 3.1), ('iris_90', 2.6), ('iris_91', 3.0), ('iris_92', 2.6), ('iris_93', 2.3), ('iris_94', 2.7), ('iris_95', 3.0), ('iris_96', 2.9), ('iris_97', 2.9), ('iris_98', 2.5), ('iris_99', 2.8)])), ('petal_length', OrderedDict([('iris_0', 1.4), ('iris_1', 1.4), ('iris_10', 1.5), ('iris_100', 6.0), ('iris_101', 5.1), ('iris_102', 5.9), ('iris_103', 5.6), ('iris_104', 5.8), ('iris_105', 6.6), ('iris_106', 4.5), ('iris_107', 6.3), ('iris_108', 5.8), ('iris_109', 6.1), ('iris_11', 1.6), ('iris_110', 5.1), ('iris_111', 5.3), ('iris_112', 5.5), ('iris_113', 5.0), ('iris_114', 5.1), ('iris_115', 5.3), ('iris_116', 
5.5), ('iris_117', 6.7), ('iris_118', 6.9), ('iris_119', 5.0), ('iris_12', 1.4), ('iris_120', 5.7), ('iris_121', 4.9), ('iris_122', 6.7), ('iris_123', 4.9), ('iris_124', 5.7), ('iris_125', 6.0), ('iris_126', 4.8), ('iris_127', 4.9), ('iris_128', 5.6), ('iris_129', 5.8), ('iris_13', 1.1), ('iris_130', 6.1), ('iris_131', 6.4), ('iris_132', 5.6), ('iris_133', 5.1), ('iris_134', 5.6), ('iris_135', 6.1), ('iris_136', 5.6), ('iris_137', 5.5), ('iris_138', 4.8), ('iris_139', 5.4), ('iris_14', 1.2), ('iris_140', 5.6), ('iris_141', 5.1), ('iris_142', 5.1), ('iris_143', 5.9), ('iris_144', 5.7), ('iris_145', 5.2), ('iris_146', 5.0), ('iris_147', 5.2), ('iris_148', 5.4), ('iris_149', 5.1), ('iris_15', 1.5), ('iris_16', 1.3), ('iris_17', 1.4), ('iris_18', 1.7), ('iris_19', 1.5), ('iris_2', 1.3), ('iris_20', 1.7), ('iris_21', 1.5), ('iris_22', 1.0), ('iris_23', 1.7), ('iris_24', 1.9), ('iris_25', 1.6), ('iris_26', 1.6), ('iris_27', 1.5), ('iris_28', 1.4), ('iris_29', 1.6), ('iris_3', 1.5), ('iris_30', 1.6), ('iris_31', 1.5), ('iris_32', 1.5), ('iris_33', 1.4), ('iris_34', 1.5), ('iris_35', 1.2), ('iris_36', 1.3), ('iris_37', 1.4), ('iris_38', 1.3), ('iris_39', 1.5), ('iris_4', 1.4), ('iris_40', 1.3), ('iris_41', 1.3), ('iris_42', 1.3), ('iris_43', 1.6), ('iris_44', 1.9), ('iris_45', 1.4), ('iris_46', 1.6), ('iris_47', 1.4), ('iris_48', 1.5), ('iris_49', 1.4), ('iris_5', 1.7), ('iris_50', 4.7), ('iris_51', 4.5), ('iris_52', 4.9), ('iris_53', 4.0), ('iris_54', 4.6), ('iris_55', 4.5), ('iris_56', 4.7), ('iris_57', 3.3), ('iris_58', 4.6), ('iris_59', 3.9), ('iris_6', 1.4), ('iris_60', 3.5), ('iris_61', 4.2), ('iris_62', 4.0), ('iris_63', 4.7), ('iris_64', 3.6), ('iris_65', 4.4), ('iris_66', 4.5), ('iris_67', 4.1), ('iris_68', 4.5), ('iris_69', 3.9), ('iris_7', 1.5), ('iris_70', 4.8), ('iris_71', 4.0), ('iris_72', 4.9), ('iris_73', 4.7), ('iris_74', 4.3), ('iris_75', 4.4), ('iris_76', 4.8), ('iris_77', 5.0), ('iris_78', 4.5), ('iris_79', 3.5), ('iris_8', 1.4), ('iris_80', 3.8), 
('iris_81', 3.7), ('iris_82', 3.9), ('iris_83', 5.1), ('iris_84', 4.5), ('iris_85', 4.5), ('iris_86', 4.7), ('iris_87', 4.4), ('iris_88', 4.1), ('iris_89', 4.0), ('iris_9', 1.5), ('iris_90', 4.4), ('iris_91', 4.6), ('iris_92', 4.0), ('iris_93', 3.3), ('iris_94', 4.2), ('iris_95', 4.2), ('iris_96', 4.2), ('iris_97', 4.3), ('iris_98', 3.0), ('iris_99', 4.1)])), ('petal_width', OrderedDict([('iris_0', 0.2), ('iris_1', 0.2), ('iris_10', 0.2), ('iris_100', 2.5), ('iris_101', 1.9), ('iris_102', 2.1), ('iris_103', 1.8), ('iris_104', 2.2), ('iris_105', 2.1), ('iris_106', 1.7), ('iris_107', 1.8), ('iris_108', 1.8), ('iris_109', 2.5), ('iris_11', 0.2), ('iris_110', 2.0), ('iris_111', 1.9), ('iris_112', 2.1), ('iris_113', 2.0), ('iris_114', 2.4), ('iris_115', 2.3), ('iris_116', 1.8), ('iris_117', 2.2), ('iris_118', 2.3), ('iris_119', 1.5), ('iris_12', 0.1), ('iris_120', 2.3), ('iris_121', 2.0), ('iris_122', 2.0), ('iris_123', 1.8), ('iris_124', 2.1), ('iris_125', 1.8), ('iris_126', 1.8), ('iris_127', 1.8), ('iris_128', 2.1), ('iris_129', 1.6), ('iris_13', 0.1), ('iris_130', 1.9), ('iris_131', 2.0), ('iris_132', 2.2), ('iris_133', 1.5), ('iris_134', 1.4), ('iris_135', 2.3), ('iris_136', 2.4), ('iris_137', 1.8), ('iris_138', 1.8), ('iris_139', 2.1), ('iris_14', 0.2), ('iris_140', 2.4), ('iris_141', 2.3), ('iris_142', 1.9), ('iris_143', 2.3), ('iris_144', 2.5), ('iris_145', 2.3), ('iris_146', 1.9), ('iris_147', 2.0), ('iris_148', 2.3), ('iris_149', 1.8), ('iris_15', 0.4), ('iris_16', 0.4), ('iris_17', 0.3), ('iris_18', 0.3), ('iris_19', 0.3), ('iris_2', 0.2), ('iris_20', 0.2), ('iris_21', 0.4), ('iris_22', 0.2), ('iris_23', 0.5), ('iris_24', 0.2), ('iris_25', 0.2), ('iris_26', 0.4), ('iris_27', 0.2), ('iris_28', 0.2), ('iris_29', 0.2), ('iris_3', 0.2), ('iris_30', 0.2), ('iris_31', 0.4), ('iris_32', 0.1), ('iris_33', 0.2), ('iris_34', 0.2), ('iris_35', 0.2), ('iris_36', 0.2), ('iris_37', 0.1), ('iris_38', 0.2), ('iris_39', 0.2), ('iris_4', 0.2), ('iris_40', 0.3), ('iris_41', 
0.3), ('iris_42', 0.2), ('iris_43', 0.6), ('iris_44', 0.4), ('iris_45', 0.3), ('iris_46', 0.2), ('iris_47', 0.2), ('iris_48', 0.2), ('iris_49', 0.2), ('iris_5', 0.4), ('iris_50', 1.4), ('iris_51', 1.5), ('iris_52', 1.5), ('iris_53', 1.3), ('iris_54', 1.5), ('iris_55', 1.3), ('iris_56', 1.6), ('iris_57', 1.0), ('iris_58', 1.3), ('iris_59', 1.4), ('iris_6', 0.3), ('iris_60', 1.0), ('iris_61', 1.5), ('iris_62', 1.0), ('iris_63', 1.4), ('iris_64', 1.3), ('iris_65', 1.4), ('iris_66', 1.5), ('iris_67', 1.0), ('iris_68', 1.5), ('iris_69', 1.1), ('iris_7', 0.2), ('iris_70', 1.8), ('iris_71', 1.3), ('iris_72', 1.5), ('iris_73', 1.2), ('iris_74', 1.3), ('iris_75', 1.4), ('iris_76', 1.4), ('iris_77', 1.7), ('iris_78', 1.5), ('iris_79', 1.0), ('iris_8', 0.2), ('iris_80', 1.1), ('iris_81', 1.0), ('iris_82', 1.2), ('iris_83', 1.6), ('iris_84', 1.5), ('iris_85', 1.6), ('iris_86', 1.5), ('iris_87', 1.3), ('iris_88', 1.3), ('iris_89', 1.3), ('iris_9', 0.1), ('iris_90', 1.2), ('iris_91', 1.4), ('iris_92', 1.2), ('iris_93', 1.0), ('iris_94', 1.3), ('iris_95', 1.2), ('iris_96', 1.3), ('iris_97', 1.3), ('iris_98', 1.1), ('iris_99', 1.3)]))]) 

# Build the frame from `data` and pick the two groups by position in its index:
# the first five labels form group A, the last ten form group B.
X_iris = pd.DataFrame(data)
idx_groupA = X_iris.index[:5]
idx_groupB = X_iris.index[-10:]



# Call 1: group A in sorted order. The values below are the output reported by
# the author; they match log2(iris_90) - log2(iris_0), e.g. log2(5.5 / 5.1).
pairwise_logfc(X_iris, sorted(idx_groupA), idx_groupB).loc[("iris_90", "iris_0")]
# sepal_length    0.108934
# sepal_width    -0.428843
# petal_length    1.652077
# petal_width     2.584963
# Name: (iris_90, iris_0), dtype: float64

# Call 2: same groups, group A reversed. The same label now returns different
# values. Assuming the index keeps the key order of `data`, these numbers equal
# log2(iris_94) - log2(iris_101) (e.g. log2(5.6 / 5.8) ~= -0.0506), i.e. the row
# is attached to the wrong label.
pairwise_logfc(X_iris, sorted(idx_groupA)[::-1], idx_groupB).loc[("iris_90", "iris_0")]
# sepal_length   -0.050626
# sepal_width     0.000000
# petal_length   -0.280108
# petal_width    -0.547488
# Name: (iris_90, iris_0), dtype: float64

【问题讨论】:

对不起,我还是有点糊涂。当我执行 df_log.loc[group_A,:] 时,结果的每个索引都应与 group_A 一致,因此 Ar_groupA[i,:] 应该对应 group_A 的第 i 个元素,对吗? 因为有这一行:labels = [*itertools.product(*groups)],我原本期望它们是一致的。我以为 group_A 和 group_B 会被正确地组合。 我觉得可能是我没有表达清楚,或者漏掉了某个关键的逻辑。 df_log 是一个 pd.DataFrame,并且保留了 X 的原始索引。 df_log.loc[group_A,:].values 的顺序应该与 group_A 相同;df_log.loc[group_A,:] 的第 i 个元素应该与 Ar_groupA 的第 i 个元素相同,两者对应的标签都应为 group_A[i]。由于 itertools.product 不会排序,只依赖于两个输入数组的顺序(符合预期),所以无论我在把 idx_groupA 传给 group_A 之前是否对它排序,结果都应该相同。对吗?谢谢。 【参考方案1】:

更新。完全摆脱“成对配置文件”中的循环。我们可以使用numpy广播在一行中做减法:D。

import numpy as np

def pairwise_logfc(df_data, group_A, group_B):
    """Return all pairwise log2 fold-changes ``log2(B) - log2(A)``.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Samples in rows, attributes in columns. The values are passed to
        ``np.log2`` as-is. The frame itself is not modified.
    group_A, group_B : list-like of row labels
        Labels of ``df_data.index`` selecting the two groups. Either group
        may be empty, in which case the result has no rows.

    Returns
    -------
    pandas.DataFrame
        One row per ``(group_B, group_A)`` pair, in the order of
        ``itertools.product(group_B, group_A)`` (B outer, A inner), indexed by
        a two-level MultiIndex named ``["group_B", "group_A"]`` and with the
        columns of ``df_data``. Because values and labels are generated in the
        same order, the result for a given label pair does not depend on the
        order in which the groups are passed.
    """
    attr_labels = df_data.columns

    # np.log2 returns a new frame, so no defensive copy of the input is needed.
    df_log = np.log2(df_data)

    # Label-based row selection, converted to plain 2-D arrays.
    Ar_groupA = df_log.loc[group_A, :].values
    Ar_groupB = df_log.loc[group_B, :].values

    # Broadcasting (nB, 1, k) - (1, nA, k) -> (nB, nA, k). Flattening the first
    # two axes in C order yields B-major rows: (B0,A0), (B0,A1), ..., (B1,A0), ...
    # An explicit reshape (rather than np.vstack over the first axis) also
    # works when group_B is empty.
    n_pairs = Ar_groupB.shape[0] * Ar_groupA.shape[0]
    logfc_profiles = (Ar_groupB[:, None, :] - Ar_groupA[None, :, :]).reshape(
        n_pairs, len(attr_labels)
    )

    # from_product enumerates (B, A) with A varying fastest, matching the
    # row order produced by the reshape above.
    index = pd.MultiIndex.from_product(
        [group_B, group_A], names=["group_B", "group_A"]
    )
    return pd.DataFrame(logfc_profiles, index=index, columns=attr_labels)

# With the broadcasting version, the row for a given label pair is the same
# whether group A is passed in sorted order ...
pairwise_logfc(X_iris, sorted(idx_groupA), idx_groupB).loc[("iris_90", "iris_0")]
#sepal_length    0.108934
#sepal_width    -0.428843
#petal_length    1.652077
#petal_width     2.584963
#Name: (iris_90, iris_0), dtype: float64

# ... or in reversed order (reported output is identical to the call above).
pairwise_logfc(X_iris, sorted(idx_groupA)[::-1], idx_groupB).loc[("iris_90", "iris_0")]
#sepal_length    0.108934
#sepal_width    -0.428843
#petal_length    1.652077
#petal_width     2.584963
#Name: (iris_90, iris_0), dtype: float64

原始修复:

您需要更改 “Pairwise profiles” 部分中循环的嵌套顺序。在您的原始代码中,外循环遍历的是 A,但生成标签时用的是 product(group_B, group_A),所以外循环需要遍历 B。(itertools.product 中最右边的参数变化最快,相当于内层循环。)

# Pairwise profiles
# Fragment meant to replace the loop inside pairwise_logfc; it uses group_A,
# group_B, Ar_groupA and Ar_groupB from that function.
# The outer loop runs over B and the inner loop over A, so rows are appended
# in the order (B0,A0), (B0,A1), ..., (B1,A0), ... -- the same order as
# itertools.product(group_B, group_A).
logfc_profiles = list()
for i in  range(len(group_B)):
    u = Ar_groupB[i,:]
    for j in range(len(group_A)):
        v = Ar_groupA[j,:]
        # B row minus A row, as in the original function
        logfc_profiles.append(u - v)

【讨论】:

@O.rka 查看更新,里面有一种更巧妙的减法写法。 哇,在我的数据集上,使用 np.newaxis 后墙钟时间(wall time)从 14.5 秒下降到 4.96 秒。你能说明一下它是如何做到与之前的 for 循环等价的吗? @O.rka,np.newaxis 只是 None 的一个别名,作用是为数组添加一个新轴。因此,如果数组最初的形状是 (10,4),它将变成 (10,1,4)。这样我们就可以在简单的减法 (10,1,4) - (5,4) 中使用广播(broadcasting),最终得到向量化的减法而不是循环。 相关问题(原文中的 “This question” 链接)应该有参考价值。 vstack 随后把 (10,5,4) 堆叠成二维的 (50,4),这样就可以构造 DataFrame 了。

以上是关于为啥标签与 pandas、itertools 和 numpy 索引不一致?的主要内容,如果未能解决你的问题,请参考以下文章

为啥 itertools.groupby 可以将 NaN 分组在列表中而不是 numpy 数组中

为啥 itertools groupby 不能按预期工作? [复制]

为啥 itertools.groupby() 不起作用? [复制]

在 Python 中,为啥 itertools.cycle 需要额外的内存? [复制]

为啥使用 itertools.product 时会出现 MemoryError?

为啥 itertools.chain 比扁平化列表理解更快?