如果您需要参数化函数,如何使用 pandas apply 方法
How to use pandas apply method, in case you need a parameterized function
输入:
0 1 2
TNN R11W MSLQEMFRFPRGLLLGSVLLVASAPATL
ASTN1 E5V MALAALCALLACCWGPAAVLATAAGDVDPSK
HSPB7 H19P MSHRTSSTFRAERSFHSSHSSSSSSTSSSASRALPAQDPPMEK
CLCNKB C3Y MECFVGLREGSSGNPVTLQELWGPCPRIRRGIRG
SZRD1 P10L MEDEEVAESWEEAADSGEIDRRLEKKL
预期输出:
0 1 2
TNN R11W MSLQEMFRFPWGLLLGSVLLVASAPATL
ASTN1 E5V NaN
HSPB7 H19P MSHRTSSTFRAERSFHSSPSSSSSSTSSSASRALPAQDPPMEK
CLCNKB C3Y MEYFVGLREGSSGNPVTLQELWGPCPRIRRGIRG
SZRD1 P10L NaN
代码:示例
import pandas as pd
import sys
# Parse the comma-separated file into one list of fields per line.
with open(file1, 'r') as mvf:
    lines_1 = []
    for raw_line in mvf:
        lines_1.append(raw_line.rstrip('\n').split(','))
# One DataFrame row per parsed line.
df = pd.DataFrame(lines_1)
class CharacterReplacer:
    """Rewrite one character of a text column as directed by a variant column.

    A variant description such as ``"R11W"`` is parsed as: the character
    expected in the text (first character), its 1-based position (the digits
    in between), and the replacement character (last character).
    """

    def __init__(self, varcolname=1, textcolname=2):
        """Store the row labels to read from.

        Args:
            varcolname: Label of the column holding the variant description.
                A pandas Series (e.g. ``df[1]``) is also accepted, in which
                case its ``name`` is used as the label.
            textcolname: Label of the column holding the text to modify; a
                Series is accepted the same way.
        """
        # The defaults are plain labels: a default such as ``df[1]`` would be
        # evaluated once at definition time and is a Series, which cannot be
        # used as a key into a row.
        self.varcolname = self._as_label(varcolname)
        self.textcolname = self._as_label(textcolname)

    @staticmethod
    def _as_label(column):
        """Return the column label for a Series, or the value unchanged."""
        return column.name if isinstance(column, pd.Series) else column

    def text_replacer_informed_from_variant_column(self, row):
        """Return a copy of ``row`` with its text cell substituted.

        Intended for ``DataFrame.apply(..., axis=1)``.

        Args:
            row: A row Series containing both configured labels.

        Returns:
            A new Series; the text cell holds the edited string, or NaN when
            the substitution could not be applied.

        Raises:
            ValueError: If the middle of the variant description is not an
                integer.
        """
        # Work on a copy: pandas does not support functions that mutate the
        # object passed to ``apply``.
        row = row.copy()
        variant_desc = row[self.varcolname]
        fromchar = variant_desc[0]
        pos = int(variant_desc[1:-1])
        tochar = variant_desc[-1]
        # Argument order must follow the signature: (text, pos, from, to).
        row[self.textcolname] = CharacterReplacer.replace_a_char_in_a_pos(
            row[self.textcolname], pos, fromchar, tochar
        )
        return row

    @staticmethod
    def replace_a_char_in_a_pos(target_seq, pos, fromchar, tochar):
        """Replace the character at 1-based ``pos`` if it equals ``fromchar``.

        Args:
            target_seq: The string to edit.
            pos: 1-based position of the character to replace.
            fromchar: Character expected at ``pos``.
            tochar: Character to put in its place.

        Returns:
            The edited string, or NaN (after writing a message to stderr)
            when ``pos`` is outside the string or the character found there
            is not ``fromchar``.
        """
        index = pos - 1
        # Guard the range explicitly: a too-large position would raise
        # IndexError, and position 0 would wrap around to the last character.
        if not 0 <= index < len(target_seq):
            sys.stderr.write(
                "position {} is outside the sequence of length {}\n".format(
                    pos, len(target_seq)
                )
            )
            return float("nan")
        if target_seq[index] != fromchar:
            sys.stderr.write(
                "{} does not match with {}\n".format(target_seq[index], fromchar)
            )
            return float("nan")
        return target_seq[:index] + tochar + target_seq[index + 1:]
# Pass the column labels (1 and 2), not the columns themselves: each row that
# ``apply(..., axis=1)`` hands to the method is indexed by column label.
charreplacer = CharacterReplacer(1, 2)
df_new = df.apply(charreplacer.text_replacer_informed_from_variant_column, axis=1)
我想根据 'column 1' 中的信息更改 'column 2' 中的一个字符。例如,在第一行中,'column 1' 的值是 'R11W',所以我会检查 'column 2' 的第 11 个字符是否为 'R'。如果是 'R',就把它改成 'W';如果不是,就在该单元格中写入 'NaN'。如何利用 'column 1' 的信息替换 'column 2' 中的字符?
我修改了你的代码。
设置:
import pandas as pd
import numpy as np
import sys
# Try: df = pd.read_csv('file1', header=None)
# Each list below is one column of the example frame, in row order.
labels = ['TNN', 'ASTN1', 'HSPB7', 'CLCNKB', 'SZRD1']
variant_codes = ['R11W', 'E5V', 'H19P', 'C3Y', 'P10L']
texts = [
    'MSLQEMFRFPRGLLLGSVLLVASAPATL',
    'MALAALCALLACCWGPAAVLATAAGDVDPSK',
    'MSHRTSSTFRAERSFHSSHSSSSSSTSSSASRALPAQDPPMEK',
    'MECFVGLREGSSGNPVTLQELWGPCPRIRRGIRG',
    'MEDEEVAESWEEAADSGEIDRRLEKKL',
]
df = pd.DataFrame({0: labels, 1: variant_codes, 2: texts})
代码:
class CharacterReplacer:
    """Rewrite one character of a text column as directed by a variant column.

    A variant description such as ``"R11W"`` is parsed as: the character
    expected in the text (first character), its 1-based position (the digits
    in between), and the replacement character (last character).
    """

    def __init__(self, varcolname, textcolname):
        """Store the row labels to read from.

        Args:
            varcolname: Label of the column holding the variant description.
            textcolname: Label of the column holding the text to modify.
        """
        self.varcolname = varcolname
        self.textcolname = textcolname

    def text_replacer_informed_from_variant_column(self, row):
        """Return a copy of ``row`` with its text cell substituted.

        Intended for ``DataFrame.apply(..., axis=1)``.

        Args:
            row: A row Series containing both configured labels.

        Returns:
            A new Series; the text cell holds the edited string, or NaN when
            the substitution could not be applied.

        Raises:
            ValueError: If the middle of the variant description is not an
                integer.
        """
        # Work on a copy: pandas does not support functions that mutate the
        # object passed to ``apply``.
        row = row.copy()
        variant_desc = row[self.varcolname]
        fromchar = variant_desc[0]
        pos = int(variant_desc[1:-1])
        tochar = variant_desc[-1]
        row[self.textcolname] = CharacterReplacer.replace_a_char_in_a_pos(
            row[self.textcolname], pos, fromchar, tochar
        )
        return row

    @staticmethod
    def replace_a_char_in_a_pos(target_seq, pos, fromchar, tochar):
        """Replace the character at 1-based ``pos`` if it equals ``fromchar``.

        Args:
            target_seq: The string to edit.
            pos: 1-based position of the character to replace.
            fromchar: Character expected at ``pos``.
            tochar: Character to put in its place.

        Returns:
            The edited string, or NaN (after writing a message to stderr)
            when ``pos`` is outside the string or the character found there
            is not ``fromchar``.
        """
        index = pos - 1
        # Guard the range explicitly: a too-large position would raise
        # IndexError, and position 0 would wrap around to the last character.
        if not 0 <= index < len(target_seq):
            sys.stderr.write(
                f"position {pos} is outside the sequence of length {len(target_seq)}\n"
            )
            return np.nan
        if target_seq[index] != fromchar:
            sys.stderr.write(f"{target_seq[index]} does not match with {fromchar}\n")
            # ``np.nan`` rather than ``np.NaN``: the capitalised alias was
            # removed in NumPy 2.0.
            return np.nan
        return f"{target_seq[:index]}{tochar}{target_seq[index + 1:]}"
使用:
# 1 and 2 are the column names
charreplacer = CharacterReplacer(varcolname=1, textcolname=2)
row_transform = charreplacer.text_replacer_informed_from_variant_column
out = df.apply(row_transform, axis=1)
输出:
>>> out
0 1 2
0 TNN R11W MSLQEMFRFPWGLLLGSVLLVASAPATL
1 ASTN1 E5V NaN
2 HSPB7 H19P MSHRTSSTFRAERSFHSSPSSSSSSTSSSASRALPAQDPPMEK
3 CLCNKB C3Y MEYFVGLREGSSGNPVTLQELWGPCPRIRRGIRG
4 SZRD1 P10L NaN
编辑
你的数据文件是这样的吗:
TNN,R11W,MSLQEMFRFPRGLLLGSVLLVASAPATL
ASTN1,E5V,MALAALCALLACCWGPAAVLATAAGDVDPSK
HSPB7,H19P,MSHRTSSTFRAERSFHSSHSSSSSSTSSSASRALPAQDPPMEK
CLCNKB,C3Y,MECFVGLREGSSGNPVTLQELWGPCPRIRRGIRG
SZRD1,P10L,MEDEEVAESWEEAADSGEIDRRLEKKL
输入:
0 1 2
TNN R11W MSLQEMFRFPRGLLLGSVLLVASAPATL
ASTN1 E5V MALAALCALLACCWGPAAVLATAAGDVDPSK
HSPB7 H19P MSHRTSSTFRAERSFHSSHSSSSSSTSSSASRALPAQDPPMEK
CLCNKB C3Y MECFVGLREGSSGNPVTLQELWGPCPRIRRGIRG
SZRD1 P10L MEDEEVAESWEEAADSGEIDRRLEKKL
预期输出:
0 1 2
TNN R11W MSLQEMFRFPWGLLLGSVLLVASAPATL
ASTN1 E5V NaN
HSPB7 H19P MSHRTSSTFRAERSFHSSPSSSSSSTSSSASRALPAQDPPMEK
CLCNKB C3Y MEYFVGLREGSSGNPVTLQELWGPCPRIRRGIRG
SZRD1 P10L NaN
代码:示例
import pandas as pd
import sys
# Parse the comma-separated file into one list of fields per line.
with open(file1, 'r') as mvf:
    lines_1 = []
    for raw_line in mvf:
        lines_1.append(raw_line.rstrip('\n').split(','))
# One DataFrame row per parsed line.
df = pd.DataFrame(lines_1)
class CharacterReplacer:
    """Rewrite one character of a text column as directed by a variant column.

    A variant description such as ``"R11W"`` is parsed as: the character
    expected in the text (first character), its 1-based position (the digits
    in between), and the replacement character (last character).
    """

    def __init__(self, varcolname=1, textcolname=2):
        """Store the row labels to read from.

        Args:
            varcolname: Label of the column holding the variant description.
                A pandas Series (e.g. ``df[1]``) is also accepted, in which
                case its ``name`` is used as the label.
            textcolname: Label of the column holding the text to modify; a
                Series is accepted the same way.
        """
        # The defaults are plain labels: a default such as ``df[1]`` would be
        # evaluated once at definition time and is a Series, which cannot be
        # used as a key into a row.
        self.varcolname = self._as_label(varcolname)
        self.textcolname = self._as_label(textcolname)

    @staticmethod
    def _as_label(column):
        """Return the column label for a Series, or the value unchanged."""
        return column.name if isinstance(column, pd.Series) else column

    def text_replacer_informed_from_variant_column(self, row):
        """Return a copy of ``row`` with its text cell substituted.

        Intended for ``DataFrame.apply(..., axis=1)``.

        Args:
            row: A row Series containing both configured labels.

        Returns:
            A new Series; the text cell holds the edited string, or NaN when
            the substitution could not be applied.

        Raises:
            ValueError: If the middle of the variant description is not an
                integer.
        """
        # Work on a copy: pandas does not support functions that mutate the
        # object passed to ``apply``.
        row = row.copy()
        variant_desc = row[self.varcolname]
        fromchar = variant_desc[0]
        pos = int(variant_desc[1:-1])
        tochar = variant_desc[-1]
        # Argument order must follow the signature: (text, pos, from, to).
        row[self.textcolname] = CharacterReplacer.replace_a_char_in_a_pos(
            row[self.textcolname], pos, fromchar, tochar
        )
        return row

    @staticmethod
    def replace_a_char_in_a_pos(target_seq, pos, fromchar, tochar):
        """Replace the character at 1-based ``pos`` if it equals ``fromchar``.

        Args:
            target_seq: The string to edit.
            pos: 1-based position of the character to replace.
            fromchar: Character expected at ``pos``.
            tochar: Character to put in its place.

        Returns:
            The edited string, or NaN (after writing a message to stderr)
            when ``pos`` is outside the string or the character found there
            is not ``fromchar``.
        """
        index = pos - 1
        # Guard the range explicitly: a too-large position would raise
        # IndexError, and position 0 would wrap around to the last character.
        if not 0 <= index < len(target_seq):
            sys.stderr.write(
                "position {} is outside the sequence of length {}\n".format(
                    pos, len(target_seq)
                )
            )
            return float("nan")
        if target_seq[index] != fromchar:
            sys.stderr.write(
                "{} does not match with {}\n".format(target_seq[index], fromchar)
            )
            return float("nan")
        return target_seq[:index] + tochar + target_seq[index + 1:]
# Pass the column labels (1 and 2), not the columns themselves: each row that
# ``apply(..., axis=1)`` hands to the method is indexed by column label.
charreplacer = CharacterReplacer(1, 2)
df_new = df.apply(charreplacer.text_replacer_informed_from_variant_column, axis=1)
我想根据 'column 1' 中的信息更改 'column 2' 中的一个字符。例如,在第一行中,'column 1' 的值是 'R11W',所以我会检查 'column 2' 的第 11 个字符是否为 'R'。如果是 'R',就把它改成 'W';如果不是,就在该单元格中写入 'NaN'。如何利用 'column 1' 的信息替换 'column 2' 中的字符?
我修改了你的代码。
设置:
import pandas as pd
import numpy as np
import sys
# Try: df = pd.read_csv('file1', header=None)
# Each list below is one column of the example frame, in row order.
labels = ['TNN', 'ASTN1', 'HSPB7', 'CLCNKB', 'SZRD1']
variant_codes = ['R11W', 'E5V', 'H19P', 'C3Y', 'P10L']
texts = [
    'MSLQEMFRFPRGLLLGSVLLVASAPATL',
    'MALAALCALLACCWGPAAVLATAAGDVDPSK',
    'MSHRTSSTFRAERSFHSSHSSSSSSTSSSASRALPAQDPPMEK',
    'MECFVGLREGSSGNPVTLQELWGPCPRIRRGIRG',
    'MEDEEVAESWEEAADSGEIDRRLEKKL',
]
df = pd.DataFrame({0: labels, 1: variant_codes, 2: texts})
代码:
class CharacterReplacer:
    """Rewrite one character of a text column as directed by a variant column.

    A variant description such as ``"R11W"`` is parsed as: the character
    expected in the text (first character), its 1-based position (the digits
    in between), and the replacement character (last character).
    """

    def __init__(self, varcolname, textcolname):
        """Store the row labels to read from.

        Args:
            varcolname: Label of the column holding the variant description.
            textcolname: Label of the column holding the text to modify.
        """
        self.varcolname = varcolname
        self.textcolname = textcolname

    def text_replacer_informed_from_variant_column(self, row):
        """Return a copy of ``row`` with its text cell substituted.

        Intended for ``DataFrame.apply(..., axis=1)``.

        Args:
            row: A row Series containing both configured labels.

        Returns:
            A new Series; the text cell holds the edited string, or NaN when
            the substitution could not be applied.

        Raises:
            ValueError: If the middle of the variant description is not an
                integer.
        """
        # Work on a copy: pandas does not support functions that mutate the
        # object passed to ``apply``.
        row = row.copy()
        variant_desc = row[self.varcolname]
        fromchar = variant_desc[0]
        pos = int(variant_desc[1:-1])
        tochar = variant_desc[-1]
        row[self.textcolname] = CharacterReplacer.replace_a_char_in_a_pos(
            row[self.textcolname], pos, fromchar, tochar
        )
        return row

    @staticmethod
    def replace_a_char_in_a_pos(target_seq, pos, fromchar, tochar):
        """Replace the character at 1-based ``pos`` if it equals ``fromchar``.

        Args:
            target_seq: The string to edit.
            pos: 1-based position of the character to replace.
            fromchar: Character expected at ``pos``.
            tochar: Character to put in its place.

        Returns:
            The edited string, or NaN (after writing a message to stderr)
            when ``pos`` is outside the string or the character found there
            is not ``fromchar``.
        """
        index = pos - 1
        # Guard the range explicitly: a too-large position would raise
        # IndexError, and position 0 would wrap around to the last character.
        if not 0 <= index < len(target_seq):
            sys.stderr.write(
                f"position {pos} is outside the sequence of length {len(target_seq)}\n"
            )
            return np.nan
        if target_seq[index] != fromchar:
            sys.stderr.write(f"{target_seq[index]} does not match with {fromchar}\n")
            # ``np.nan`` rather than ``np.NaN``: the capitalised alias was
            # removed in NumPy 2.0.
            return np.nan
        return f"{target_seq[:index]}{tochar}{target_seq[index + 1:]}"
使用:
# 1 and 2 are the column names
charreplacer = CharacterReplacer(varcolname=1, textcolname=2)
row_transform = charreplacer.text_replacer_informed_from_variant_column
out = df.apply(row_transform, axis=1)
输出:
>>> out
0 1 2
0 TNN R11W MSLQEMFRFPWGLLLGSVLLVASAPATL
1 ASTN1 E5V NaN
2 HSPB7 H19P MSHRTSSTFRAERSFHSSPSSSSSSTSSSASRALPAQDPPMEK
3 CLCNKB C3Y MEYFVGLREGSSGNPVTLQELWGPCPRIRRGIRG
4 SZRD1 P10L NaN
编辑
你的数据文件是这样的吗:
TNN,R11W,MSLQEMFRFPRGLLLGSVLLVASAPATL
ASTN1,E5V,MALAALCALLACCWGPAAVLATAAGDVDPSK
HSPB7,H19P,MSHRTSSTFRAERSFHSSHSSSSSSTSSSASRALPAQDPPMEK
CLCNKB,C3Y,MECFVGLREGSSGNPVTLQELWGPCPRIRRGIRG
SZRD1,P10L,MEDEEVAESWEEAADSGEIDRRLEKKL