如何在 SQL Server 中通过逐行比较来计算一致度?
How to calculate the degree of agreement by row comparisons in SQL Server?
对于最小的、可重现的示例 (reprex),我们假设我在 Microsoft SQL Server 中有一个数据库对象 (dbo),并且我想用 T-SQL 查询其内容。
我的 dbo 看起来像这样:
Animal-ID Marker-ID Allele1 Allele2
--------------------------------------------
1 OAR1 A G
1 OAR2 C C
1 OAR3 T G
2 OAR1 A C
2 OAR2 C C
2 OAR3 A C
我想做的是:对每一对动物 ID,计算每个标记 ID 上等位基因的匹配百分比。
给定上面的 dbo 示例,所需的结果如下所示:
Animal-ID-pair Marker-ID Match-percentage
--------------------------------------------
1-2 OAR1 50
1-2 OAR2 100
1-2 OAR3 0
到目前为止,我尝试了以下方法:
起初我认为选择单独的行就足够了。
-- First attempt: number all rows ordered by animal and keep only rows 1-3.
-- The column names contain a hyphen, so they must be bracket-quoted;
-- unquoted, T-SQL parses Animal-ID as the subtraction "Animal - ID".
SELECT *
FROM
    (SELECT
         ROW_NUMBER() OVER (ORDER BY [Animal-ID] ASC) AS rownumber,
         [Animal-ID], [Marker-ID],
         Allele1, Allele2
     FROM
         dbo) AS foo
WHERE
    rownumber BETWEEN 1 AND 3;
然后将其与 4 到 6 之间的范围进行比较。
这里的问题是,在我的真实数据集中,并不是所有的动物 ID 对都具有相同的行数,即标记的数量不同。
这就是我认为分组可能有用的原因:
-- Second attempt: keep only the rows of animals that have more than one row.
-- Hyphenated column names are bracket-quoted; unquoted, T-SQL would parse
-- Animal-ID as the subtraction "Animal - ID".
SELECT
    [Animal-ID], [Marker-ID],
    Allele1, Allele2
FROM
    dbo
WHERE
    [Animal-ID] IN (SELECT [Animal-ID] FROM dbo
                    GROUP BY [Animal-ID]
                    HAVING COUNT(*) > 1);
但这不允许我进行跨组比较和/或计算。
所以想请教一下:按行对进行比较时,一致度应该如何计算?
可以使用子查询和 STUFF 来实现:
-- Sample data in a table variable: one row per (animal, marker) with its two alleles.
DECLARE @T TABLE(Animal_ID INT, Marker_ID CHAR(10) , Allele1 CHAR, Allele2 CHAR)
INSERT INTO @T VALUES
(1,'OAR1','A','G'),
(1,'OAR2','C','C'),
(1,'OAR3','T','G'),
(2,'OAR1','A','C'),
(2,'OAR2','C','C'),
(2,'OAR3','A','C')
-- Show the raw rows.
SELECT * FROM @T
-- Percentage = (count of rows sharing a repeated Allele2 value
--               + count of rows sharing a repeated Allele1 value) * 100
--              / length of the concatenated allele string for the marker.
-- All operands are integers, so the division is integer division.
SELECT S.*,(ISNULL(S1.C,0)+ISNULL(S2.C,0))*100/LEN(Allele_Pair) AS Percentage
FROM (
-- S: one row per marker. FOR XML PATH('') concatenates, across all rows of that
-- marker, the animal ids (each prefixed with '-'; STUFF removes the leading '-')
-- and the allele letters (STUFF with length 0 removes nothing).
SELECT STUFF((SELECT CONCAT('-' , Animal_ID ) FROM @T t1
WHERE t1.Marker_ID = t2.Marker_ID FOR XML PATH ('')), 1, 1, '') AS Animal_ID_Pair
,Marker_ID,
STUFF((SELECT CONCAT(Allele1,Allele2) FROM @T t1
WHERE t1.Marker_ID = t2.Marker_ID FOR XML PATH ('')), 1, 0, '') AS Allele_Pair
FROM @T t2
GROUP BY Marker_ID) S
-- S1 / S2: per marker, the Allele2 / Allele1 values that occur in more than one row,
-- with their occurrence count C. Markers without such a value get NULL (treated as 0 above).
-- NOTE(review): with more than two animals per marker, several allele values could each
-- pass the HAVING filter and these joins would then return several rows per marker — confirm
-- this query is only meant for exactly two animals.
LEFT JOIN (SELECT Marker_ID,Allele2,COUNT(Allele2) AS C FROm @T GROUP BY Allele2,Marker_ID HAVING COUNT(Allele2)>1) S1 ON S1.Marker_ID=S.Marker_ID
LEFT JOIN (SELECT Marker_ID,Allele1,COUNT(Allele1) AS C FROm @T GROUP BY Allele1,Marker_ID HAVING COUNT(Allele1)>1) S2 ON S2.Marker_ID=S.Marker_ID
输出:
Animal_ID_Pair Marker_ID Allele_Pair Percentage
1-2 OAR1 AGAC 50
1-2 OAR2 CCCC 100
1-2 OAR3 TGAC 0
自连接就能实现你想要的结果——再配合一点算术运算:
-- Pair every animal with each higher-numbered animal on the same marker and
-- score the pair: every allele column that matches contributes 1.0, and the
-- sum is divided by the number of allele columns (2). The result is a 0-1
-- fraction; multiply by 100 for a percentage.
-- marker_id is returned so that the rows of one animal pair can be told apart.
select t1.animal_id, t2.animal_id, t1.marker_id,
       ( case when t1.allele1 = t2.allele1 then 1.0 else 0 end +
         case when t1.allele2 = t2.allele2 then 1.0 else 0 end
       ) / 2.0 as match_percentage
from t t1 join
     t t2
     on t1.marker_id = t2.marker_id and
        t1.animal_id < t2.animal_id;
这样要加入新的等位基因列也很容易。您还可以先对等位基因列做逆透视 (unpivot),再聚合,来表达同样的计算:
-- Unpivot the two allele columns into rows (allele = column position 1 or 2,
-- val = the allele letter), then compare two animals position by position
-- on the same marker and average the matches (0-1 fraction per pair and marker).
with ta as (
      select t.*, v.*
      from t cross apply
           (values (1, allele1), (2, allele2)) v(allele, val)
     )
select ta1.animal_id, ta2.animal_id, ta1.marker_id,
       avg(case when ta1.val = ta2.val then 1.0 else 0 end) as match_percentage
from ta ta1 join
     ta ta2
     on ta1.marker_id = ta2.marker_id and
        -- compare allele1 with allele1 and allele2 with allele2 only;
        -- without this the average would run over all four cross combinations
        ta1.allele = ta2.allele and
        ta1.animal_id < ta2.animal_id
group by ta1.animal_id, ta2.animal_id, ta1.marker_id;
示例数据
-- Sample table: one row per animal/marker combination holding its two alleles.
CREATE TABLE genomes
(
    AnimalId INT,
    MarkerId NVARCHAR(10),
    Allele1  NVARCHAR(1),
    Allele2  NVARCHAR(2)
);

-- Animals 1 and 2 have all three markers; animal 3 has only two.
INSERT INTO genomes (AnimalId, MarkerId, Allele1, Allele2)
VALUES
    (1, 'OAR1', 'A', 'G'),
    (1, 'OAR2', 'C', 'C'),
    (1, 'OAR3', 'T', 'G'),
    (2, 'OAR1', 'A', 'C'),
    (2, 'OAR2', 'C', 'C'),
    (2, 'OAR3', 'A', 'C'),
    (3, 'OAR1', 'A', 'G'), -- extra sample animal with less data (no OAR3 row)
    (3, 'OAR2', 'C', 'G');
解决方案
- 选出所有不重复的动物(`cte_AllAnimals`)。
- 选出所有不重复的标记(`cte_AllMarkers`)。
- 将每只动物与排在其后的每只动物配对(`a2.AnimalId > a1.AnimalId`)。这将为您提供所有不重复的动物组合。
- 将每个动物对与每个标记组合(`cross join cte_AllMarkers`)。
这给了我:
-- Build every (animal pair, marker) combination and score it:
-- 50 points for each allele column whose value is equal for both animals.
-- A marker missing for either animal yields NULL alleles via the left joins,
-- so the comparison is not true and that column scores 0.
with cte_AllMarkers as
(
    -- every distinct marker present in the data
    select distinct g.MarkerId
    from genomes g
),
cte_AllAnimals as
(
    -- every distinct animal present in the data
    select distinct g.AnimalId
    from genomes g
)
select cast(a1.AnimalId as nvarchar(10)) + '-' +
       cast(a2.AnimalId as nvarchar(10)) as AnimalIdPair,
       m.MarkerId,
       case when g1.Allele1 = g2.Allele1 then 50 else 0 end +
       case when g1.Allele2 = g2.Allele2 then 50 else 0 end as MatchPercentage
from cte_AllAnimals a1
join cte_AllAnimals a2
    on a2.AnimalId > a1.AnimalId      -- each unordered pair exactly once
cross join cte_AllMarkers m
left join genomes g1
    on  g1.MarkerId = m.MarkerId
    and g1.AnimalId = a1.AnimalId
left join genomes g2
    on  g2.MarkerId = m.MarkerId
    and g2.AnimalId = a2.AnimalId
order by a1.AnimalId,
         a2.AnimalId,
         m.MarkerId;
结果
AnimalIdPair MarkerId MatchPercentage
------------ -------- ---------------
1-2 OAR1 50
1-2 OAR2 100
1-2 OAR3 0
1-3 OAR1 100
1-3 OAR2 50
1-3 OAR3 0
2-3 OAR1 50
2-3 OAR2 50
2-3 OAR3 0
可在 Fiddle 中查看实际运行效果。
对于最小的、可重现的示例 (reprex),我们假设我在 Microsoft SQL Server 中有一个数据库对象 (dbo),并且我想用 T-SQL 查询其内容。
我的 dbo 看起来像这样:
Animal-ID Marker-ID Allele1 Allele2
--------------------------------------------
1 OAR1 A G
1 OAR2 C C
1 OAR3 T G
2 OAR1 A C
2 OAR2 C C
2 OAR3 A C
我想做的是:对每一对动物 ID,计算每个标记 ID 上等位基因的匹配百分比。
给定上面的 dbo 示例,所需的结果如下所示:
Animal-ID-pair Marker-ID Match-percentage
--------------------------------------------
1-2 OAR1 50
1-2 OAR2 100
1-2 OAR3 0
到目前为止,我尝试了以下方法:
起初我认为选择单独的行就足够了。
-- First attempt: number all rows ordered by animal and keep only rows 1-3.
-- The column names contain a hyphen, so they must be bracket-quoted;
-- unquoted, T-SQL parses Animal-ID as the subtraction "Animal - ID".
SELECT *
FROM
    (SELECT
         ROW_NUMBER() OVER (ORDER BY [Animal-ID] ASC) AS rownumber,
         [Animal-ID], [Marker-ID],
         Allele1, Allele2
     FROM
         dbo) AS foo
WHERE
    rownumber BETWEEN 1 AND 3;
然后将其与 4 到 6 之间的范围进行比较。
这里的问题是,在我的真实数据集中,并不是所有的动物 ID 对都具有相同的行数,即标记的数量不同。
这就是我认为分组可能有用的原因:
-- Second attempt: keep only the rows of animals that have more than one row.
-- Hyphenated column names are bracket-quoted; unquoted, T-SQL would parse
-- Animal-ID as the subtraction "Animal - ID".
SELECT
    [Animal-ID], [Marker-ID],
    Allele1, Allele2
FROM
    dbo
WHERE
    [Animal-ID] IN (SELECT [Animal-ID] FROM dbo
                    GROUP BY [Animal-ID]
                    HAVING COUNT(*) > 1);
但这不允许我进行跨组比较和/或计算。
所以想请教一下:按行对进行比较时,一致度应该如何计算?
可以使用子查询和 STUFF 来实现:
-- Sample data in a table variable: one row per (animal, marker) with its two alleles.
DECLARE @T TABLE(Animal_ID INT, Marker_ID CHAR(10) , Allele1 CHAR, Allele2 CHAR)
INSERT INTO @T VALUES
(1,'OAR1','A','G'),
(1,'OAR2','C','C'),
(1,'OAR3','T','G'),
(2,'OAR1','A','C'),
(2,'OAR2','C','C'),
(2,'OAR3','A','C')
-- Show the raw rows.
SELECT * FROM @T
-- Percentage = (count of rows sharing a repeated Allele2 value
--               + count of rows sharing a repeated Allele1 value) * 100
--              / length of the concatenated allele string for the marker.
-- All operands are integers, so the division is integer division.
SELECT S.*,(ISNULL(S1.C,0)+ISNULL(S2.C,0))*100/LEN(Allele_Pair) AS Percentage
FROM (
-- S: one row per marker. FOR XML PATH('') concatenates, across all rows of that
-- marker, the animal ids (each prefixed with '-'; STUFF removes the leading '-')
-- and the allele letters (STUFF with length 0 removes nothing).
SELECT STUFF((SELECT CONCAT('-' , Animal_ID ) FROM @T t1
WHERE t1.Marker_ID = t2.Marker_ID FOR XML PATH ('')), 1, 1, '') AS Animal_ID_Pair
,Marker_ID,
STUFF((SELECT CONCAT(Allele1,Allele2) FROM @T t1
WHERE t1.Marker_ID = t2.Marker_ID FOR XML PATH ('')), 1, 0, '') AS Allele_Pair
FROM @T t2
GROUP BY Marker_ID) S
-- S1 / S2: per marker, the Allele2 / Allele1 values that occur in more than one row,
-- with their occurrence count C. Markers without such a value get NULL (treated as 0 above).
-- NOTE(review): with more than two animals per marker, several allele values could each
-- pass the HAVING filter and these joins would then return several rows per marker — confirm
-- this query is only meant for exactly two animals.
LEFT JOIN (SELECT Marker_ID,Allele2,COUNT(Allele2) AS C FROm @T GROUP BY Allele2,Marker_ID HAVING COUNT(Allele2)>1) S1 ON S1.Marker_ID=S.Marker_ID
LEFT JOIN (SELECT Marker_ID,Allele1,COUNT(Allele1) AS C FROm @T GROUP BY Allele1,Marker_ID HAVING COUNT(Allele1)>1) S2 ON S2.Marker_ID=S.Marker_ID
输出:
Animal_ID_Pair Marker_ID Allele_Pair Percentage
1-2 OAR1 AGAC 50
1-2 OAR2 CCCC 100
1-2 OAR3 TGAC 0
自连接就能实现你想要的结果——再配合一点算术运算:
-- Pair every animal with each higher-numbered animal on the same marker and
-- score the pair: every allele column that matches contributes 1.0, and the
-- sum is divided by the number of allele columns (2). The result is a 0-1
-- fraction; multiply by 100 for a percentage.
-- marker_id is returned so that the rows of one animal pair can be told apart.
select t1.animal_id, t2.animal_id, t1.marker_id,
       ( case when t1.allele1 = t2.allele1 then 1.0 else 0 end +
         case when t1.allele2 = t2.allele2 then 1.0 else 0 end
       ) / 2.0 as match_percentage
from t t1 join
     t t2
     on t1.marker_id = t2.marker_id and
        t1.animal_id < t2.animal_id;
这样要加入新的等位基因列也很容易。您还可以先对等位基因列做逆透视 (unpivot),再聚合,来表达同样的计算:
-- Unpivot the two allele columns into rows (allele = column position 1 or 2,
-- val = the allele letter), then compare two animals position by position
-- on the same marker and average the matches (0-1 fraction per pair and marker).
with ta as (
      select t.*, v.*
      from t cross apply
           (values (1, allele1), (2, allele2)) v(allele, val)
     )
select ta1.animal_id, ta2.animal_id, ta1.marker_id,
       avg(case when ta1.val = ta2.val then 1.0 else 0 end) as match_percentage
from ta ta1 join
     ta ta2
     on ta1.marker_id = ta2.marker_id and
        -- compare allele1 with allele1 and allele2 with allele2 only;
        -- without this the average would run over all four cross combinations
        ta1.allele = ta2.allele and
        ta1.animal_id < ta2.animal_id
group by ta1.animal_id, ta2.animal_id, ta1.marker_id;
示例数据
-- Sample table: one row per animal/marker combination holding its two alleles.
CREATE TABLE genomes
(
    AnimalId INT,
    MarkerId NVARCHAR(10),
    Allele1  NVARCHAR(1),
    Allele2  NVARCHAR(2)
);

-- Animals 1 and 2 have all three markers; animal 3 has only two.
INSERT INTO genomes (AnimalId, MarkerId, Allele1, Allele2)
VALUES
    (1, 'OAR1', 'A', 'G'),
    (1, 'OAR2', 'C', 'C'),
    (1, 'OAR3', 'T', 'G'),
    (2, 'OAR1', 'A', 'C'),
    (2, 'OAR2', 'C', 'C'),
    (2, 'OAR3', 'A', 'C'),
    (3, 'OAR1', 'A', 'G'), -- extra sample animal with less data (no OAR3 row)
    (3, 'OAR2', 'C', 'G');
解决方案
- 选出所有不重复的动物(`cte_AllAnimals`)。
- 选出所有不重复的标记(`cte_AllMarkers`)。
- 将每只动物与排在其后的每只动物配对(`a2.AnimalId > a1.AnimalId`)。这将为您提供所有不重复的动物组合。
- 将每个动物对与每个标记组合(`cross join cte_AllMarkers`)。
这给了我:
-- Build every (animal pair, marker) combination and score it:
-- 50 points for each allele column whose value is equal for both animals.
-- A marker missing for either animal yields NULL alleles via the left joins,
-- so the comparison is not true and that column scores 0.
with cte_AllMarkers as
(
    -- every distinct marker present in the data
    select distinct g.MarkerId
    from genomes g
),
cte_AllAnimals as
(
    -- every distinct animal present in the data
    select distinct g.AnimalId
    from genomes g
)
select cast(a1.AnimalId as nvarchar(10)) + '-' +
       cast(a2.AnimalId as nvarchar(10)) as AnimalIdPair,
       m.MarkerId,
       case when g1.Allele1 = g2.Allele1 then 50 else 0 end +
       case when g1.Allele2 = g2.Allele2 then 50 else 0 end as MatchPercentage
from cte_AllAnimals a1
join cte_AllAnimals a2
    on a2.AnimalId > a1.AnimalId      -- each unordered pair exactly once
cross join cte_AllMarkers m
left join genomes g1
    on  g1.MarkerId = m.MarkerId
    and g1.AnimalId = a1.AnimalId
left join genomes g2
    on  g2.MarkerId = m.MarkerId
    and g2.AnimalId = a2.AnimalId
order by a1.AnimalId,
         a2.AnimalId,
         m.MarkerId;
结果
AnimalIdPair MarkerId MatchPercentage
------------ -------- ---------------
1-2 OAR1 50
1-2 OAR2 100
1-2 OAR3 0
1-3 OAR1 100
1-3 OAR2 50
1-3 OAR3 0
2-3 OAR1 50
2-3 OAR2 50
2-3 OAR3 0
可在 Fiddle 中查看实际运行效果。