如何将蛋白质序列转换为 python 中的一种热编码?
How to convert Protein sequence to one hot encoding in python?
我使用此代码对我的序列进行单热编码,但它仅适用于单个序列,不适用于包含我的序列的 CSV 文件,我该怎么做?
这是代码
data = 'MGILPSPGMPALLSLVSLLSVLLMGCVAETGTQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNEVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPSGAGSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDPPEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKGSGRENLYFQGGGGSGYIPEAPRDGQAYVRKDGEWVLLSTFLGHHHHHHHH'
alphabet = ['A', 'C', 'D', 'E', 'F', 'G','H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
char_to_int = dict((c, i) for i, c in enumerate(alphabet))
integer_encoded = [char_to_int[char] for char in data]
onehot_encoded = list()
for value in integer_encoded:
letter = [0 for _ in range(len(alphabet))]
letter[value] = 1
onehot_encoded.append(letter)
e = np.array(onehot_encoded)
此代码适用于单个序列,但当我想使用我的整个数据集(CSV 文件)时,它不起作用。
data = pd.read_csv("database.csv", usecols=[4])
这是我的 database.csv 样子
这是列[4]
Sequence
0 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
1 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
2 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
3 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
4 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
5 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
6 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
7 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
8 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
.
.
.
40 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
41 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
42 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
43 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
44 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
45 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
这是错误
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
in <module>
5 char_to_int = dict((c, i) for i, c in enumerate(alphabet))
6
----> 7 integer_encoded = [char_to_int[char] for char in data]
8
9 onehot_encoded = list()
5 char_to_int = dict((c, i) for i, c in enumerate(alphabet))
6
----> 7 integer_encoded = [char_to_int[char] for char in data]
8
9 onehot_encoded = list()
KeyError: 'Sequence'
这是使用这段代码后的错误
# Import Dependencies
import pandas as pd
import numpy as np
# Function to encode sequences
def encode_seq(sequence):
alphabet = ['A', 'C', 'D', 'E', 'F', 'G','H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
char_to_int = dict((c, i) for i, c in enumerate(alphabet))
integer_encoded = [char_to_int[char] for char in data]
onehot_encoded = list()
for value in integer_encoded:
letter = [0 for _ in range(len(alphabet))]
letter[value] = 1
onehot_encoded.append(letter)
return np.array(onehot_encoded)
# Read .csv
df = pd.read_csv('database.csv')
# Keep only Sequence column
df = df.filter(['Sequence'])
# Create a new column Encoded_Sequences containing encoded sequences from Sequence column
df['Encoded_Sequences'] = df['Sequence'].apply(lambda x: encode_seq(x))
KeyError Traceback (most recent call last)
in <module>
25
26 # Create a new column Encoded_Sequences containing encoded sequences from Sequence column
---> 27 df['Encoded_Sequences'] = df['Sequence'].apply(lambda x: encode_seq(x))
in apply(self, func, convert_dtype, args, **kwargs)
4355 dtype: float64
4356 """
-> 4357 return SeriesApply(self, func, convert_dtype, args, kwargs).apply()
4358
4359 def _reduce(
in apply(self)
1041 return self.apply_str()
1042
-> 1043 return self.apply_standard()
1044
1045 def agg(self):
in apply_standard(self)
1097 # List[Union[Callable[..., Any], str]]]]]"; expected
1098 # "Callable[[Any], Any]"
-> 1099 mapped = lib.map_infer(
1100 values,
1101 f, # type: ignore[arg-type]
in pandas._libs.lib.map_infer()
in <lambda>(x)
25
26 # Create a new column Encoded_Sequences containing encoded sequences from Sequence column
---> 27 df['Encoded_Sequences'] = df['Sequence'].apply(lambda x: encode_seq(x))
in encode_seq(sequence)
7 alphabet = ['A', 'C', 'D', 'E', 'F', 'G','H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
8 char_to_int = dict((c, i) for i, c in enumerate(alphabet))
----> 9 integer_encoded = [char_to_int[char] for char in data]
10 onehot_encoded = list()
11
in <listcomp>(.0)
7 alphabet = ['A', 'C', 'D', 'E', 'F', 'G','H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
8 char_to_int = dict((c, i) for i, c in enumerate(alphabet))
----> 9 integer_encoded = [char_to_int[char] for char in data]
10 onehot_encoded = list()
11
KeyError: 'Sequence'
我需要乘以另一个矩阵,所以我需要对它们进行单热编码并将它们转换为 Numpy 数组,但它不适用于此代码,所以我该怎么办?
# Import Dependencies
import pandas as pd
import numpy as np
# Function to encode sequences
def encode_seq(sequence):
alphabet = ['A', 'C', 'D', 'E', 'F', 'G','H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
char_to_int = dict((c, i) for i, c in enumerate(alphabet))
integer_encoded = [char_to_int[char] for char in sequence]
onehot_encoded = list()
for value in integer_encoded:
letter = [0 for _ in range(len(alphabet))]
letter[value] = 1
onehot_encoded.append(letter)
return np.array(onehot_encoded)
# Read .csv
df = pd.read_csv('database.csv')
# Keep only Sequence column
df = df.filter(['Sequence'])
# Create a new column Encoded_Sequences containing encoded sequences from Sequence column
df['Encoded_Sequences'] = df['Sequence'].apply(lambda x: encode_seq(x))
我使用此代码对我的序列进行单热编码,但它仅适用于单个序列,不适用于包含我的序列的 CSV 文件,我该怎么做? 这是代码
data = 'MGILPSPGMPALLSLVSLLSVLLMGCVAETGTQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNEVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPSGAGSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDPPEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKGSGRENLYFQGGGGSGYIPEAPRDGQAYVRKDGEWVLLSTFLGHHHHHHHH'
alphabet = ['A', 'C', 'D', 'E', 'F', 'G','H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
char_to_int = dict((c, i) for i, c in enumerate(alphabet))
integer_encoded = [char_to_int[char] for char in data]
onehot_encoded = list()
for value in integer_encoded:
letter = [0 for _ in range(len(alphabet))]
letter[value] = 1
onehot_encoded.append(letter)
e = np.array(onehot_encoded)
此代码适用于单个序列,但当我想使用我的整个数据集(CSV 文件)时,它不起作用。
data = pd.read_csv("database.csv", usecols=[4])
这是我的 database.csv 样子
Sequence
0 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
1 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
2 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
3 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
4 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
5 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
6 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
7 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
8 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
.
.
.
40 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
41 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
42 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
43 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
44 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
45 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...
这是错误
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
in <module>
5 char_to_int = dict((c, i) for i, c in enumerate(alphabet))
6
----> 7 integer_encoded = [char_to_int[char] for char in data]
8
9 onehot_encoded = list()
5 char_to_int = dict((c, i) for i, c in enumerate(alphabet))
6
----> 7 integer_encoded = [char_to_int[char] for char in data]
8
9 onehot_encoded = list()
KeyError: 'Sequence'
这是使用这段代码后的错误
# Import Dependencies
import pandas as pd
import numpy as np
# Function to encode sequences
def encode_seq(sequence):
alphabet = ['A', 'C', 'D', 'E', 'F', 'G','H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
char_to_int = dict((c, i) for i, c in enumerate(alphabet))
integer_encoded = [char_to_int[char] for char in data]
onehot_encoded = list()
for value in integer_encoded:
letter = [0 for _ in range(len(alphabet))]
letter[value] = 1
onehot_encoded.append(letter)
return np.array(onehot_encoded)
# Read .csv
df = pd.read_csv('database.csv')
# Keep only Sequence column
df = df.filter(['Sequence'])
# Create a new column Encoded_Sequences containing encoded sequences from Sequence column
df['Encoded_Sequences'] = df['Sequence'].apply(lambda x: encode_seq(x))
KeyError Traceback (most recent call last)
in <module>
25
26 # Create a new column Encoded_Sequences containing encoded sequences from Sequence column
---> 27 df['Encoded_Sequences'] = df['Sequence'].apply(lambda x: encode_seq(x))
in apply(self, func, convert_dtype, args, **kwargs)
4355 dtype: float64
4356 """
-> 4357 return SeriesApply(self, func, convert_dtype, args, kwargs).apply()
4358
4359 def _reduce(
in apply(self)
1041 return self.apply_str()
1042
-> 1043 return self.apply_standard()
1044
1045 def agg(self):
in apply_standard(self)
1097 # List[Union[Callable[..., Any], str]]]]]"; expected
1098 # "Callable[[Any], Any]"
-> 1099 mapped = lib.map_infer(
1100 values,
1101 f, # type: ignore[arg-type]
in pandas._libs.lib.map_infer()
in <lambda>(x)
25
26 # Create a new column Encoded_Sequences containing encoded sequences from Sequence column
---> 27 df['Encoded_Sequences'] = df['Sequence'].apply(lambda x: encode_seq(x))
in encode_seq(sequence)
7 alphabet = ['A', 'C', 'D', 'E', 'F', 'G','H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
8 char_to_int = dict((c, i) for i, c in enumerate(alphabet))
----> 9 integer_encoded = [char_to_int[char] for char in data]
10 onehot_encoded = list()
11
in <listcomp>(.0)
7 alphabet = ['A', 'C', 'D', 'E', 'F', 'G','H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
8 char_to_int = dict((c, i) for i, c in enumerate(alphabet))
----> 9 integer_encoded = [char_to_int[char] for char in data]
10 onehot_encoded = list()
11
KeyError: 'Sequence'
我需要乘以另一个矩阵,所以我需要对它们进行单热编码并将它们转换为 Numpy 数组,但它不适用于此代码,所以我该怎么办?
# Import Dependencies
import pandas as pd
import numpy as np
# Function to encode sequences
def encode_seq(sequence):
alphabet = ['A', 'C', 'D', 'E', 'F', 'G','H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
char_to_int = dict((c, i) for i, c in enumerate(alphabet))
integer_encoded = [char_to_int[char] for char in sequence]
onehot_encoded = list()
for value in integer_encoded:
letter = [0 for _ in range(len(alphabet))]
letter[value] = 1
onehot_encoded.append(letter)
return np.array(onehot_encoded)
# Read .csv
df = pd.read_csv('database.csv')
# Keep only Sequence column
df = df.filter(['Sequence'])
# Create a new column Encoded_Sequences containing encoded sequences from Sequence column
df['Encoded_Sequences'] = df['Sequence'].apply(lambda x: encode_seq(x))