如何使用 SQL 通过分组和 CASE 连接两行
How to use SQL to concatenate two rows with grouping and CASE
我在 SQL Server 2016 Express 中有一些基因组数据,目前是长格式,包含参考基因组和测试基因组,并按 SubjectID、基因和密码子(即三联体)划分。
我真正需要的是将我的数据重塑为元组连接在一起的数据,但仅当元组中存在突变(与参考基因组相比)时。这将是一种对每个人都更有用的格式。
我的数据是这样的
-- Sample data in long format: one row per nucleotide position, holding the
-- reference and test nucleotide, the codon number, and the nucleotide's
-- order (1-3) within that codon.
DECLARE @myTable TABLE
(
    SubjectID           VARCHAR(MAX),
    country             VARCHAR(MAX),
    gene                VARCHAR(MAX),
    position            INT,
    ReferenceNucleotide VARCHAR(1),
    TestNucleotide      VARCHAR(1),
    codon               INT,
    nucleotide_order    INT
);

-- Explicit column list so the insert does not silently depend on the
-- declaration order of the table's columns.
INSERT INTO @myTable
    (SubjectID, country, gene, position, ReferenceNucleotide, TestNucleotide, codon, nucleotide_order)
VALUES
    ('1-0002','India','gyrA', 65,'A','x', 92,1),
    ('1-0002','India','gyrA', 66,'T','x', 92,2),
    ('1-0002','India','gyrA', 67,'C','C', 92,3),
    ('1-0002','India','gyrA', 68,'T','T', 93,1),
    ('1-0002','India','gyrA', 69,'A','A', 93,2),
    ('1-0002','India','gyrA', 70,'C','C', 93,3),
    ('1-0002','India','gyrA', 71,'G','G', 94,1),
    ('1-0002','India','gyrA', 72,'A','A', 94,2),
    ('1-0002','India','gyrA', 73,'C','C', 94,3),
    ('1-0002','India','gyrA', 74,'A','A', 95,1),
    ('1-0002','India','gyrA', 75,'G','C', 95,2),
    ('1-0002','India','gyrA', 76,'C','C', 95,3),
    ('1-0002','India','gyrA', 77,'C','C', 96,1),
    ('1-0002','India','gyrA', 78,'T','T', 96,2),
    ('1-0002','India','gyrA', 79,'G','N', 96,3);
但是,有几个条件
- 如果参考基因组和测试基因组的所有三个核苷酸都相同,我想要 'WT'
- 如果核苷酸有任何差异,我想要测试基因组的三元组(按核苷酸顺序)
- 我需要按 SubjectID 和基因分组,因为我有很多受试者和基因
我期望的结果如下:
1-0002 India gyrA 92 xxC
1-0002 India gyrA 93 WT
1-0002 India gyrA 94 WT
1-0002 India gyrA 95 ACC
1-0002 India gyrA 96 CTN
我可以确定需要三元组的密码子在哪里,但我正在为如何连接它们而苦苦挣扎
-- Per subject/country/gene/codon, count how many nucleotides differ from the
-- reference. WT = 0 means every nucleotide matches; WT > 0 means at least
-- one differs.
DECLARE @myCodons TABLE (SubjectID varchar(max), country varchar(max), gene varchar(max), codon int, WT int);

INSERT INTO @myCodons (SubjectID, country, gene, codon, WT)
SELECT
    SubjectID,
    country,
    gene,
    codon,
    -- The column in @myTable is ReferenceNucleotide (RefNucleotide does not exist).
    SUM(CASE WHEN ReferenceNucleotide = TestNucleotide THEN 0 ELSE 1 END) AS WT
FROM
    @myTable
GROUP BY
    SubjectID, country, gene, codon;

SELECT SubjectID, country, gene, codon, WT
FROM @myCodons
ORDER BY codon;
从以下内容开始:
-- One row per codon: self-join @myTable so that T1, T2 and T3 are the rows
-- with nucleotide_order 1, 2 and 3 of the same subject/country/gene/codon,
-- then concatenate their nucleotides in that order.
-- The source columns are ReferenceNucleotide / TestNucleotide; the output
-- columns keep the names RefGenome / TestGenome.
select T1.SubjectID, T1.country, T1.gene, T1.codon,
       T1.ReferenceNucleotide + T2.ReferenceNucleotide + T3.ReferenceNucleotide as RefGenome,
       T1.TestNucleotide + T2.TestNucleotide + T3.TestNucleotide as TestGenome
from @myTable T1
inner join @myTable T2 on T1.SubjectID = T2.SubjectID and T1.country = T2.country
    and T1.gene = T2.gene and T1.codon = T2.codon and T2.nucleotide_order = 2
inner join @myTable T3 on T1.SubjectID = T3.SubjectID and T1.country = T3.country
    and T1.gene = T3.gene and T1.codon = T3.codon and T3.nucleotide_order = 3
where T1.nucleotide_order = 1
然后您可以在此基础上使用 case 语句来确定是显示测试基因组还是 'WT'
可能有用的替代方法:
-- Alternative: for each codon (anchored on its nucleotide_order = 1 row),
-- build comma-separated reference and test strings with FOR XML PATH and
-- return 'WT' when the two strings are identical, otherwise the test string.
select t1.SubjectID, t1.country, t1.gene, t1.codon,
       case when ca.RefGenomeStr = ca.TestGenomeStr then 'WT' else ca.TestGenomeStr end as wanted_string
from @myTable t1
cross apply (
    select
        stuff((
            select ', ' + t2.ReferenceNucleotide
            from @myTable t2
            where t2.SubjectID = t1.SubjectID and t2.country = t1.country
              and t2.gene = t1.gene and t2.codon = t1.codon
            -- without an ORDER BY the concatenation order is not guaranteed
            order by t2.nucleotide_order
            for xml path ('')
        ), 1, 2, ''),   -- remove the leading ', ' (two characters, not one)
        stuff((
            select ', ' + t2.TestNucleotide
            from @myTable t2
            where t2.SubjectID = t1.SubjectID and t2.country = t1.country
              and t2.gene = t1.gene and t2.codon = t1.codon
            order by t2.nucleotide_order
            for xml path ('')
        ), 1, 2, '')
) ca (RefGenomeStr, TestGenomeStr)
where t1.nucleotide_order = 1
order by t1.SubjectID, t1.gene, t1.codon;
结果:
+----+-----------+---------+------+-------+---------------+
| | SubjectID | country | gene | codon | wanted_string |
+----+-----------+---------+------+-------+---------------+
| 1 | 1-0002 | India | gyrA | 92 | x, x, C |
| 2 | 1-0002 | India | gyrA | 93 | WT |
| 3 | 1-0002 | India | gyrA | 94 | WT |
| 4 | 1-0002 | India | gyrA | 95 | A, C, C |
| 5 | 1-0002 | India | gyrA | 96 | C, T, N |
+----+-----------+---------+------+-------+---------------+
我在 SQL Server 2016 Express 中有一些基因组数据,目前是长格式,包含参考基因组和测试基因组,并按 SubjectID、基因和密码子(即三联体)划分。
我真正需要的是将我的数据重塑为元组连接在一起的数据,但仅当元组中存在突变(与参考基因组相比)时。这将是一种对每个人都更有用的格式。
我的数据是这样的
-- Sample data in long format: one row per nucleotide position, holding the
-- reference and test nucleotide, the codon number, and the nucleotide's
-- order (1-3) within that codon.
DECLARE @myTable TABLE
(
    SubjectID           VARCHAR(MAX),
    country             VARCHAR(MAX),
    gene                VARCHAR(MAX),
    position            INT,
    ReferenceNucleotide VARCHAR(1),
    TestNucleotide      VARCHAR(1),
    codon               INT,
    nucleotide_order    INT
);

-- Explicit column list so the insert does not silently depend on the
-- declaration order of the table's columns.
INSERT INTO @myTable
    (SubjectID, country, gene, position, ReferenceNucleotide, TestNucleotide, codon, nucleotide_order)
VALUES
    ('1-0002','India','gyrA', 65,'A','x', 92,1),
    ('1-0002','India','gyrA', 66,'T','x', 92,2),
    ('1-0002','India','gyrA', 67,'C','C', 92,3),
    ('1-0002','India','gyrA', 68,'T','T', 93,1),
    ('1-0002','India','gyrA', 69,'A','A', 93,2),
    ('1-0002','India','gyrA', 70,'C','C', 93,3),
    ('1-0002','India','gyrA', 71,'G','G', 94,1),
    ('1-0002','India','gyrA', 72,'A','A', 94,2),
    ('1-0002','India','gyrA', 73,'C','C', 94,3),
    ('1-0002','India','gyrA', 74,'A','A', 95,1),
    ('1-0002','India','gyrA', 75,'G','C', 95,2),
    ('1-0002','India','gyrA', 76,'C','C', 95,3),
    ('1-0002','India','gyrA', 77,'C','C', 96,1),
    ('1-0002','India','gyrA', 78,'T','T', 96,2),
    ('1-0002','India','gyrA', 79,'G','N', 96,3);
但是,有几个条件
- 如果参考基因组和测试基因组的所有三个核苷酸都相同,我想要 'WT'
- 如果核苷酸有任何差异,我想要测试基因组的三元组(按核苷酸顺序)
- 我需要按 SubjectID 和基因分组,因为我有很多受试者和基因
我期望的结果如下:
1-0002 India gyrA 92 xxC
1-0002 India gyrA 93 WT
1-0002 India gyrA 94 WT
1-0002 India gyrA 95 ACC
1-0002 India gyrA 96 CTN
我可以确定需要三元组的密码子在哪里,但我正在为如何连接它们而苦苦挣扎
-- Per subject/country/gene/codon, count how many nucleotides differ from the
-- reference. WT = 0 means every nucleotide matches; WT > 0 means at least
-- one differs.
DECLARE @myCodons TABLE (SubjectID varchar(max), country varchar(max), gene varchar(max), codon int, WT int);

INSERT INTO @myCodons (SubjectID, country, gene, codon, WT)
SELECT
    SubjectID,
    country,
    gene,
    codon,
    -- The column in @myTable is ReferenceNucleotide (RefNucleotide does not exist).
    SUM(CASE WHEN ReferenceNucleotide = TestNucleotide THEN 0 ELSE 1 END) AS WT
FROM
    @myTable
GROUP BY
    SubjectID, country, gene, codon;

SELECT SubjectID, country, gene, codon, WT
FROM @myCodons
ORDER BY codon;
从以下内容开始:
-- One row per codon: self-join @myTable so that T1, T2 and T3 are the rows
-- with nucleotide_order 1, 2 and 3 of the same subject/country/gene/codon,
-- then concatenate their nucleotides in that order.
-- The source columns are ReferenceNucleotide / TestNucleotide; the output
-- columns keep the names RefGenome / TestGenome.
select T1.SubjectID, T1.country, T1.gene, T1.codon,
       T1.ReferenceNucleotide + T2.ReferenceNucleotide + T3.ReferenceNucleotide as RefGenome,
       T1.TestNucleotide + T2.TestNucleotide + T3.TestNucleotide as TestGenome
from @myTable T1
inner join @myTable T2 on T1.SubjectID = T2.SubjectID and T1.country = T2.country
    and T1.gene = T2.gene and T1.codon = T2.codon and T2.nucleotide_order = 2
inner join @myTable T3 on T1.SubjectID = T3.SubjectID and T1.country = T3.country
    and T1.gene = T3.gene and T1.codon = T3.codon and T3.nucleotide_order = 3
where T1.nucleotide_order = 1
然后您可以在此基础上使用 case 语句来确定是显示测试基因组还是 'WT'
可能有用的替代方法:
-- Alternative: for each codon (anchored on its nucleotide_order = 1 row),
-- build comma-separated reference and test strings with FOR XML PATH and
-- return 'WT' when the two strings are identical, otherwise the test string.
select t1.SubjectID, t1.country, t1.gene, t1.codon,
       case when ca.RefGenomeStr = ca.TestGenomeStr then 'WT' else ca.TestGenomeStr end as wanted_string
from @myTable t1
cross apply (
    select
        stuff((
            select ', ' + t2.ReferenceNucleotide
            from @myTable t2
            where t2.SubjectID = t1.SubjectID and t2.country = t1.country
              and t2.gene = t1.gene and t2.codon = t1.codon
            -- without an ORDER BY the concatenation order is not guaranteed
            order by t2.nucleotide_order
            for xml path ('')
        ), 1, 2, ''),   -- remove the leading ', ' (two characters, not one)
        stuff((
            select ', ' + t2.TestNucleotide
            from @myTable t2
            where t2.SubjectID = t1.SubjectID and t2.country = t1.country
              and t2.gene = t1.gene and t2.codon = t1.codon
            order by t2.nucleotide_order
            for xml path ('')
        ), 1, 2, '')
) ca (RefGenomeStr, TestGenomeStr)
where t1.nucleotide_order = 1
order by t1.SubjectID, t1.gene, t1.codon;
结果:
+----+-----------+---------+------+-------+---------------+
| | SubjectID | country | gene | codon | wanted_string |
+----+-----------+---------+------+-------+---------------+
| 1 | 1-0002 | India | gyrA | 92 | x, x, C |
| 2 | 1-0002 | India | gyrA | 93 | WT |
| 3 | 1-0002 | India | gyrA | 94 | WT |
| 4 | 1-0002 | India | gyrA | 95 | A, C, C |
| 5 | 1-0002 | India | gyrA | 96 | C, T, N |
+----+-----------+---------+------+-------+---------------+