如果您需要参数化函数,如何使用 pandas apply 方法

How to use the pandas apply method when you need a parameterized function

输入:

0      1     2
TNN    R11W  MSLQEMFRFPRGLLLGSVLLVASAPATL
ASTN1  E5V   MALAALCALLACCWGPAAVLATAAGDVDPSK
HSPB7  H19P  MSHRTSSTFRAERSFHSSHSSSSSSTSSSASRALPAQDPPMEK
CLCNKB C3Y   MECFVGLREGSSGNPVTLQELWGPCPRIRRGIRG
SZRD1  P10L  MEDEEVAESWEEAADSGEIDRRLEKKL

预期输出:

0      1     2
TNN    R11W  MSLQEMFRFPWGLLLGSVLLVASAPATL
ASTN1  E5V   NaN
HSPB7  H19P  MSHRTSSTFRAERSFHSSPSSSSSSTSSSASRALPAQDPPMEK
CLCNKB C3Y   MEYFVGLREGSSGNPVTLQELWGPCPRIRRGIRG
SZRD1  P10L  NaN

代码:示例

import pandas as pd
import sys

# NOTE(review): `file1` is not defined anywhere in the visible code; it
# presumably holds the path to the comma-separated input file -- confirm.
with open(file1, 'r') as mvf:
    # Strip the trailing newline from each line and split it on commas.
    lines_1 = [line.rstrip('\n').split(',') for line in mvf]

# Build the frame from the parsed rows; no column names are passed here.
df = pd.DataFrame(lines_1)

class CharacterReplacer:
    """Replace one character of a text column using a variant description.

    A variant description is parsed as: first character, an integer in the
    middle, and last character (for example ``R11W``).
    """

    # NOTE(review): these defaults are evaluated once, when the class body
    # runs, and they are whole columns of `df` rather than column labels.
    def __init__(self,varcolname=df[1], textcolname=df[2]):
        self.varcolname=varcolname
        self.textcolname=textcolname

    def text_replacer_informed_from_variant_column(self,row):
        """Parse the row's variant description and rewrite its text cell.

        Returns the same `row` object after assigning the new text value.
        """
        variant_desc=row[self.varcolname]
        # c = first character, p = integer between the ends, r = last character.
        c, p, r =  (variant_desc[0], int(variant_desc[1:-1]), variant_desc[-1])
        # NOTE(review): the helper's signature is (target_seq, pos, fromchar,
        # tochar), but the arguments are passed here as (c, p, r), so the
        # character `c` arrives as `pos` and the integer `p` as `fromchar`.
        row[self.textcolname]=CharacterReplacer.replace_a_char_in_a_pos(row[self.textcolname], c, p, r)
        return row

    @staticmethod
    def replace_a_char_in_a_pos(target_seq, pos, fromchar, tochar):
        """Return `target_seq` with the character at 1-based `pos` set to `tochar`.

        A mismatch between the existing character and `fromchar` is only
        reported on stderr; the replacement is still performed afterwards.
        """
        # Convert the 1-based position to a 0-based index.
        pos=pos-1
        if( target_seq[pos]!=fromchar ):
            sys.stderr.write("{} does not match with {}".format(target_seq[pos],fromchar))

        # Splice `tochar` between the text before and after the index.
        out_seq= target_seq[:pos]+  tochar + target_seq[pos+1:]
        return out_seq

# NOTE(review): df[1] and df[2] are the columns' data, not their labels; the
# method later uses these values to index each row -- confirm this is intended.
charreplacer=CharacterReplacer(df[1],df[2])
# axis=1 makes apply call the method once per row.
df_new = df.apply(charreplacer.text_replacer_informed_from_variant_column, axis=1)

我想根据 'column 1' 中的信息更改 'column 2' 中的一个字符。例如,在第一行中,'column 1' 的值是 R11W,所以我会检查 'column 2' 的第 11 个字符是否为 'R'。如果是 'R',我想把它改成 'W';如果不是,我想在该单元格中写入 'NaN'。如何用 'column 1' 的信息替换 'column 2' 中的字符?

我修改了你的代码。

设置:

import pandas as pd
import numpy as np
import sys

# Sample data mirroring the question's input; for a real file, try:
#   df = pd.read_csv('file1', header=None)
genes = ['TNN', 'ASTN1', 'HSPB7', 'CLCNKB', 'SZRD1']
variants = ['R11W', 'E5V', 'H19P', 'C3Y', 'P10L']
sequences = [
    'MSLQEMFRFPRGLLLGSVLLVASAPATL',
    'MALAALCALLACCWGPAAVLATAAGDVDPSK',
    'MSHRTSSTFRAERSFHSSHSSSSSSTSSSASRALPAQDPPMEK',
    'MECFVGLREGSSGNPVTLQELWGPCPRIRRGIRG',
    'MEDEEVAESWEEAADSGEIDRRLEKKL',
]

# Columns are labelled 0, 1 and 2: gene, variant description, sequence.
df = pd.DataFrame({0: genes, 1: variants, 2: sequences})

代码:

class CharacterReplacer:
    """Apply single-character substitutions described by a variant column.

    Each row holds a variant description such as ``R11W`` (expected
    character, 1-based position, replacement character) in one column and
    the text to edit in another. Instances are meant to be passed to
    ``DataFrame.apply(..., axis=1)``.
    """

    def __init__(self, varcolname, textcolname):
        """Store the labels of the variant column and the text column."""
        self.varcolname = varcolname
        self.textcolname = textcolname

    def text_replacer_informed_from_variant_column(self, row):
        """Return a copy of `row` with its text cell edited per its variant.

        The text cell becomes NaN when the substitution cannot be applied
        (see `replace_a_char_in_a_pos`).

        Raises:
            ValueError: if the middle of the variant description is not an
                integer.
        """
        variant_desc = row[self.varcolname]
        fromchar = variant_desc[0]
        pos = int(variant_desc[1:-1])
        tochar = variant_desc[-1]

        # Work on a copy: mutating the object handed over by `apply` is
        # discouraged by pandas and can leak changes into the source frame.
        updated = row.copy()
        updated[self.textcolname] = self.replace_a_char_in_a_pos(
            row[self.textcolname], pos, fromchar, tochar
        )
        return updated

    @staticmethod
    def replace_a_char_in_a_pos(target_seq: str, pos: int, fromchar: str, tochar: str):
        """Replace the character at 1-based `pos` if it equals `fromchar`.

        Returns:
            The edited string, or NaN (after a message on stderr) when
            `pos` lies outside the string or the character found there is
            not `fromchar`.
        """
        index = pos - 1
        # Reject positions outside the string explicitly; otherwise a
        # position of 0 would wrap around to the last character and a
        # too-large position would raise IndexError.
        if not 0 <= index < len(target_seq):
            sys.stderr.write(
                f"position {pos} is outside the sequence of length {len(target_seq)}\n"
            )
            return np.nan

        if target_seq[index] != fromchar:
            sys.stderr.write(f"{target_seq[index]} does not match with {fromchar}\n")
            # `np.nan` rather than `np.NaN`: the latter alias was removed in NumPy 2.0.
            return np.nan

        return f"{target_seq[:index]}{tochar}{target_seq[index + 1:]}"

使用:

# 1 and 2 are the column names: variant descriptions and sequences.
charreplacer = CharacterReplacer(varcolname=1, textcolname=2)
# Call the bound method once per row.
out = df.apply(
    charreplacer.text_replacer_informed_from_variant_column,
    axis="columns",
)

输出:

>>> out
        0     1                                            2
0     TNN  R11W                 MSLQEMFRFPWGLLLGSVLLVASAPATL
1   ASTN1   E5V                                          NaN
2   HSPB7  H19P  MSHRTSSTFRAERSFHSSPSSSSSSTSSSASRALPAQDPPMEK
3  CLCNKB   C3Y           MEYFVGLREGSSGNPVTLQELWGPCPRIRRGIRG
4   SZRD1  P10L                                          NaN

编辑

你的数据文件是这样的吗:

TNN,R11W,MSLQEMFRFPRGLLLGSVLLVASAPATL
ASTN1,E5V,MALAALCALLACCWGPAAVLATAAGDVDPSK
HSPB7,H19P,MSHRTSSTFRAERSFHSSHSSSSSSTSSSASRALPAQDPPMEK
CLCNKB,C3Y,MECFVGLREGSSGNPVTLQELWGPCPRIRRGIRG
SZRD1,P10L,MEDEEVAESWEEAADSGEIDRRLEKKL