合并具有多个变量的 2 个数据集(不能只使用相似的变量)
Merging 2 datasets with multiple variables (Cannot use just similar variables)
我正在尝试合并 2 个数据集(150,000 条记录和 50,000 条记录),每个数据集大约有 50 个变量,其中一些可能匹配。两个数据集中的一个共同变量是 'Incident date',但我不能只用它来合并,因为同一个日期可能有大约 300 起事件(该数据集按地址、城市、县、邮编以及紧急医疗服务 (EMS) 接到通知的时间细分)。另一个数据集有事件发生的确切时间、地址、城市、县、邮政编码和其他一些字段,但如果信息未知或未记录,这些字段可能为空。
我想创建一个缓冲区来按每个字段连接数据集。例如,首先从事件发生的日期开始(没有缺失值),如果它们相同,下一步就是检查它们是否发生在同一个县市等。 (某些值可能为空)。比较的最后一个字段是 EMS 收到通知的时间(事件发生后最多 30 - 60 分钟)。如果所有字段都匹配,则最终缓冲时间为 30 - 60 分钟。这将是多对一的合并(50,000 到 150,000)。
哪个程序可以让我这样做?有一定的代码吗?
我添加了两个数据集的片段 (https://filedropper.com/filemanager/public.php?service=files&t=0f2d129b1622901fafc8c9e678433623&download) and (https://filedropper.com/filemanager/public.php?service=files&t=642c840bc3e431c3d4d839a71bb66944&download)
预期输出看起来像这样
使用的代码是:
% Merge crash records (dataset 2) into EMS incident records (dataset 1).
% For every row of T2, each not-yet-augmented row of T1 is tested on five
% criteria (date, street, city, county, notification time). A criterion
% whose field is empty on either side is left as "matching". When all five
% hold, seven T2 columns are copied into the T1 row.
T1 = readtable('dataset1.csv')
T2 = readtable('dataset2.csv')
LT1 = size(T1,1);
LT2 = size(T2,1);
% Columns copied from T2 into T1 on a match.
newvars = {'County_Name', 'City_Name', 'Town_Name', 'CrashTime', ...
    'SecondaryLocation', 'RouteName', 'PostalCityName'};
T1 = [T1, cell2table(repmat({''}, LT1, 7), 'VariableNames', newvars)]
augmented = false(LT1,1);          % true once a T1 row has received T2 data
dtstr = 'MM/dd/yyyy HH:mm';
trange = duration([0,0,0;1,0,0]);  % accepted (notification - crash) delay: 0 h to 1 h
for tt2 = 1:LT2
    cdate2 = T2.CrashDate{tt2};
    % NOTE(review): the column appended below is 'CrashTime'; confirm that
    % T2 really has a separate 'CrashDateTime' column.
    crasht2 = T2.CrashDateTime{tt2};
    assert(~isempty(cdate2) & ~isempty(crasht2),'Major data missing')
    crashdt2 = [cdate2, ' ', crasht2];
    crashdt2 = datetime(crashdt2,'InputFormat',dtstr);
    % Normalise the T2 keys once per T2 row: upper case, letters only.
    strtaddr2 = T2.RouteName{tt2};
    if ~isempty(strtaddr2)
        strtaddr2 = upper(strtaddr2);
        strtaddr2 = strrep(strtaddr2,'ROAD','RD');
        strtaddr2 = strtaddr2(isletter(strtaddr2));
    end
    pcityn2 = T2.PostalCityName{tt2};
    if ~isempty(pcityn2)
        pcityn2 = upper(pcityn2);
        pcityn2 = pcityn2(isletter(pcityn2));
    end
    countyn2 = T2.County_Name{tt2};
    if ~isempty(countyn2)
        countyn2 = countyn2(isletter(countyn2));
        countyn2 = upper(countyn2);
        countyn2 = strrep(countyn2,'COUNTY','');
    end
    for tt1 = 1:LT1
        if augmented(tt1)
            continue
        end
        % matchvec(k) stays true when criterion k cannot be evaluated.
        matchvec = true(5,1);
        % (1) incident date
        cdate1 = T1.IncidentDate{tt1};
        matchvec(1) = strcmp(cdate1, cdate2);
        % (2) street name
        strtaddr1 = upper(T1.AddressStreet{tt1});
        if ~isempty(strtaddr2) && ~isempty(strtaddr1)
            strtaddr1 = strrep(strtaddr1,'ROAD','RD');
            strtaddr1 = strtaddr1(isletter(strtaddr1));
            matchvec(2) = strcmp(strtaddr1,strtaddr2);
        end
        % (3) postal city name
        pcityn1 = upper(T1.AddressCityIncident{tt1});
        pcityn1 = pcityn1(isletter(pcityn1));
        if ~isempty(pcityn2) && ~isempty(pcityn1)
            matchvec(3) = strcmp(pcityn1,pcityn2);
        end
        % (4) county name; 'COUNTY' is stripped here as it is for countyn2,
        % otherwise 'FAIRFAXCOUNTY' would never equal 'FAIRFAX'.
        countyn1 = upper(T1.AddressCountyIncident{tt1});
        countyn1 = countyn1(isletter(countyn1));
        countyn1 = strrep(countyn1,'COUNTY','');
        if ~isempty(countyn2) && ~isempty(countyn1)
            matchvec(4) = strcmp(countyn1,countyn2);
        end
        % (5) notification times: each time that is present must fall
        % inside trange relative to the crash time. Both strings are parsed
        % to datetime before subtracting.
        crashdt1u = T1.UnitNotified{tt1};
        crashdt1d = T1.Date12_DispatchNotified{tt1};
        tmatch = true(2,1);
        if ~isempty(crashdt1u)
            difcrdt1u = datetime(crashdt1u,'InputFormat',dtstr) - crashdt2;
            tmatch(1) = difcrdt1u >= trange(1) && difcrdt1u <= trange(2);
        end
        if ~isempty(crashdt1d)
            difcrdt1d = datetime(crashdt1d,'InputFormat',dtstr) - crashdt2;
            tmatch(2) = difcrdt1d >= trange(1) && difcrdt1d <= trange(2);
        end
        matchvec(5) = all(tmatch);
        if all(matchvec)
            T1{tt1,newvars} = table2cell( T2(tt2,newvars) );
            augmented(tt1)=true;
        else
            % Debug output: show the rejected pair and which criteria failed.
            T1(tt1,:)
            T2(tt2,:)
            matchvec
        end
    end
end
T1
编辑:优化代码以提高性能;预计大量数据。
OP 注意事项:您的原始数据有很多错误。csv 文件的实际数据中任何地方都不允许出现逗号。某些字符串(发现 1 个单位通知时间)不符合预定义的格式。try 块只处理了这一种特殊情况;如果所有字段都可能存在缺陷数据,则应对所有字段都使用 try。所有这些问题都应该在合并之前解决。
% Pre-processing stage of the merge. Loads both tables, appends to T1 the
% columns that will receive crash data, and caches a normalised copy of
% every comparison key in T1B / T2B so the matching stage does not repeat
% the string work for every (T2 row, T1 row) pair.
clear;clc;close all
T1 = readtable('dataset1.csv');
T2 = readtable('dataset2.csv');
% Work on a leading subset; min() keeps the indexing valid for tables
% that are shorter than the requested subset.
T1 = T1(1:min(1000,size(T1,1)),:);
T2 = T2(1:min(900,size(T2,1)),:);
LT1 = size(T1,1);
LT2 = size(T2,1);
% append empty columns to T1 that will hold the fields copied from T2
T1 = [T1, cell2table(repmat({''}, LT1, 7), ...
    'VariableNames', {'County_Name', 'City_Name', 'Town_Name', ...
    'CrashTime', 'SecondaryLocation', 'RouteName', 'PostalCityName'})];
augmented = false(LT1,1); % per T1 row; used by the matching stage
dtstr = 'MM/dd/yyyy HH:mm';
trange = duration([-1,0,0;1,0,0]); % accepted (notification - crash) offset: -1 h to +1 h
% strtaddrcmpf(c1,c2): c1 and c2 are cell arrays of street-name segments.
% Returns one logical per segment s2 of c2: true when some segment s1 of
% c1 contains s2 and is contained in s2, i.e. the two are identical and
% non-empty. The any() around the inner cellfun keeps the inner result
% scalar, so c1 may hold more than one segment.
strtaddrcmpf = @(c1,c2) cellfun(@(s2) ...
    any(cellfun(@(s1) ...
    ~(isempty(strfind(s1,s2)) | isempty(strfind(s2,s1))), ...
    c1)), ...
    c2);
% buffer normalised keys to speed up matching; *Flg columns start true and
% are cleared when the corresponding field is missing or unparsable
fprintf('Pre-processing started at %s \n', datestr(datetime('now')))
T1B = cell2table([repmat({''}, LT1, 5), repmat({true}, LT1, 4)], ...
    'VariableNames', {'CrashDTU','CrashDTD', ...
    'StrtAdd','PoCityN', 'CountyN', ...
    'CrashDTFlg', 'StrtAddFlg', 'PoCityNFlg', 'CountyNFlg'});
T2B = cell2table([repmat({''}, LT2, 4), repmat({true}, LT2, 3)], ...
    'VariableNames', {'CrashDT', 'StrtAdd', 'PoCityN', 'CountyN', ...
    'StrtAddFlg', 'PoCityNFlg', 'CountyNFlg'});
% nprog is the length of the progress text currently on screen, so that
% exactly that many characters are erased before the next update.
nprog = fprintf('Progress: ');
for tt2 = 1:LT2
    fprintf('%s',repmat(sprintf('\b'),1,nprog))
    nprog = fprintf('Progress: %6.2f%%', tt2/LT2*50);
    cdate2 = T2.CrashDate{tt2};
    crasht2 = T2.CrashTime{tt2};
    assert(~isempty(cdate2) & ~isempty(crasht2),'Major data missing')
    crashdt2 = [cdate2, ' ', crasht2];
    T2B.CrashDT{tt2} = datetime(crashdt2,'InputFormat',dtstr);
    strtaddr2 = T2.RouteName{tt2};
    if ~isempty(strtaddr2)
        strtaddr2 = upper(strtaddr2);
        strtaddr2 = strrep(strtaddr2,'ROAD','RD'); % repeat for HWY ST etc
        strtaddr2 = strsplit(strtaddr2,'/');       % intersections -> segments
        % strfind on a cell array returns a cell array of index vectors;
        % as a case expression it is compared element-wise with the switch
        % value, so a case is taken when a segment has the interstate name
        % at position 1 (true == 1). Interstate names keep their digits;
        % all other segments are reduced to letters only.
        switch true
            case strfind(strtaddr2,'I95')
                strtaddr2 = {'I95'};
            case strfind(strtaddr2,'I495')
                strtaddr2 = {'I495'};
            otherwise
                strtaddr2 = cellfun(@(s) s(isletter(s)), ...
                    strtaddr2, 'Uniform',false);
        end
        T2B.StrtAdd{tt2} = strtaddr2;
    else
        T2B.StrtAddFlg(tt2) = false;
    end
    pcityn2 = T2.PostalCityName{tt2};
    if ~isempty(pcityn2)
        pcityn2 = upper(pcityn2);
        pcityn2 = pcityn2(isletter(pcityn2));
        T2B.PoCityN{tt2} = pcityn2;
    else
        T2B.PoCityNFlg(tt2) = false;
    end
    countyn2 = T2.County_Name{tt2};
    if ~isempty(countyn2)
        countyn2 = upper(countyn2);
        countyn2 = countyn2(isletter(countyn2));
        countyn2 = strrep(countyn2,'COUNTY','');
        T2B.CountyN{tt2} = countyn2;
    else
        T2B.CountyNFlg(tt2) = false;
    end
end
for tt1 = 1:LT1
    fprintf('%s',repmat(sprintf('\b'),1,nprog))
    nprog = fprintf('Progress: %6.2f%%', tt1/LT1*50+50);
    strtaddr1 = upper(T1.AddressStreet{tt1});
    if ~isempty(strtaddr1)
        strtaddr1 = strrep(strtaddr1,'ROAD','RD');
        strtaddr1 = strsplit(strtaddr1,'/');
        % same interstate handling as for T2 above
        switch true
            case strfind(strtaddr1,'I95')
                strtaddr1 = {'I95'};
            case strfind(strtaddr1,'I495')
                strtaddr1 = {'I495'};
            otherwise
                strtaddr1 = cellfun(@(s) s(isletter(s)), ...
                    strtaddr1, 'Uniform',false);
        end
        T1B.StrtAdd{tt1} = strtaddr1;
    else
        T1B.StrtAddFlg(tt1) = false;
    end
    pcityn1 = upper(T1.AddressCityIncident{tt1});
    if ~isempty(pcityn1)
        pcityn1 = pcityn1(isletter(pcityn1));
        T1B.PoCityN{tt1} = pcityn1;
    else
        T1B.PoCityNFlg(tt1) = false;
    end
    countyn1 = upper(T1.AddressCountyIncident{tt1});
    if ~isempty(countyn1)
        countyn1 = countyn1(isletter(countyn1));
        countyn1 = strrep(countyn1,'COUNTY','');
        T1B.CountyN{tt1} = countyn1;
    else
        T1B.CountyNFlg(tt1) = false;
    end
    crashdt1u = T1.UnitNotified{tt1};
    crashdt1d = T1.DispatchNotified{tt1};
    if ~isempty(crashdt1u) || ~isempty(crashdt1d)
        % a little dirty here, need both date and time; a string that does
        % not parse with dtstr disables the time check for this row
        try
            if ~isempty(crashdt1u)
                crashdt1u = datetime(crashdt1u,'InputFormat',dtstr);
                T1B.CrashDTU{tt1} = crashdt1u;
            end
            if ~isempty(crashdt1d)
                crashdt1d = datetime(crashdt1d,'InputFormat',dtstr);
                T1B.CrashDTD{tt1} = crashdt1d;
            end
        catch
            T1B.CrashDTFlg(tt1) = false;
        end
    else
        T1B.CrashDTFlg(tt1) = false;
    end
end
fprintf('%s',repmat(sprintf('\b'),1,nprog))
fprintf('Pre-processing finished at %s \n', ...
    datestr(datetime('now')))
% Matching stage. For each T2 row, scan every T1 row that has not been
% augmented yet and test the criteria in order (date, street, city, county,
% notification time), skipping to the next T1 row at the first failure.
% A criterion whose data is missing on either side counts as a match.
% Uses T1, T2, T1B, T2B, LT1, LT2, augmented, trange and strtaddrcmpf from
% the pre-processing stage.
fprintf('Matching started at %s \n', datestr(datetime('now')))
% process data
% nprog is the length of the progress text currently on screen, so that
% exactly that many characters are erased before the next update.
nprog = fprintf('Progress: ');
for tt2 = 1:LT2
    fprintf('%s',repmat(sprintf('\b'),1,nprog))
    nprog = fprintf('Progress: %6.2f%%', tt2/LT2*100);
    % extract a row for comparison; cdate2 must be read for this T2 row,
    % otherwise it would keep the value left over from pre-processing
    cdate2 = T2.CrashDate{tt2};
    crashdt2 = T2B.CrashDT{tt2};
    strtaddr2 = T2B.StrtAdd{tt2};
    pcityn2 = T2B.PoCityN{tt2};
    countyn2 = T2B.CountyN{tt2};
    for tt1 = 1:LT1
        if augmented(tt1) % match already found, skip
            continue
        end
        % Boolean comparison: treat missing data as identical
        cdate1 = T1.IncidentDate{tt1};
        match1 = strcmp(cdate1, cdate2); % incident date
        if ~match1
            continue
        end
        match2 = true; % default when a street name is missing
        if T2B.StrtAddFlg(tt2) && T1B.StrtAddFlg(tt1) % put 2 first: faster
            strtaddr1 = T1B.StrtAdd{tt1};
            strtaddr_cmp = strtaddrcmpf(strtaddr2,strtaddr1);
            match2 = any(strtaddr_cmp); % street name match
        end
        if ~match2
            continue
        end
        match3 = true; % default when a city name is missing
        if T2B.PoCityNFlg(tt2) && T1B.PoCityNFlg(tt1)
            pcityn1 = T1B.PoCityN{tt1};
            match3 = strcmp(pcityn1,pcityn2); % postal city name match
        end
        if ~match3
            continue
        end
        match4 = true; % default when a county name is missing
        if T2B.CountyNFlg(tt2) && T1B.CountyNFlg(tt1)
            % T1B.CountyN was already normalised during pre-processing
            countyn1 = T1B.CountyN{tt1};
            match4 = strcmp(countyn1,countyn2); % county name match
        end
        if ~match4
            continue
        end
        match5 = true; % default when no usable notification time exists
        if T1B.CrashDTFlg(tt1)
            crashdt1u = T1B.CrashDTU{tt1};
            crashdt1d = T1B.CrashDTD{tt1};
            % each notification time that is present must lie in trange
            % relative to the crash time; an absent one does not veto
            tmatch1 = true;
            tmatch2 = true;
            if ~isempty(crashdt1u)
                difcrdt1u = crashdt1u-crashdt2;
                tmatch1 = difcrdt1u >= trange(1) && difcrdt1u <= trange(2);
            end
            if ~isempty(crashdt1d)
                difcrdt1d = crashdt1d-crashdt2;
                tmatch2 = difcrdt1d >= trange(1) && difcrdt1d <= trange(2);
            end
            match5 = tmatch1 & tmatch2;
        end
        if ~match5
            continue
        end
        % append row in T2 to T1
        T1{tt1,{'County_Name', 'City_Name', 'Town_Name', ...
            'CrashTime', 'SecondaryLocation', 'RouteName', ...
            'PostalCityName'}} = ...
            table2cell( T2(tt2,{'County_Name', 'City_Name', ...
            'Town_Name', 'CrashTime', 'SecondaryLocation', ...
            'RouteName', 'PostalCityName'}) );
        augmented(tt1) = true;
        % break % assume unique matching
    end
end
fprintf('%s',repmat(sprintf('\b'),1,nprog))
fprintf('Matching finished at %s \nTotalling %d matches. \n', ...
    datestr(datetime('now')), sum(augmented))
编辑:随着OP新上传的数据集,涵盖了更多案例。
'GEORGETOWN PIKE/CENTRILLION DR'
等交叉路口应与 'GEORGETOWN PIKE'
或 'CENTRILLION DR'
匹配。
- 州际公路名称,如
'I95'
,其名称中包含数字,应与街道号码区分开来。
- 州际公路名称有时包含应忽略的详细位置。 (并查看其他信息)
添加了进度显示。
编辑:我忘记使用 augmented
记录来加快速度。此外,在最后添加了调试部分,以便查看匹配过程中哪些条件不满足。
这是一个在 Matlab 中使用 table
class 的解决方案。由于这是一项相当新的功能,因此在不同版本的 Matlab 中进行编程可能会有所不同。我正在使用 R2015b。
要点:
- 对于数据集 2 中的每一行,查找数据集 1 中所有行的匹配项。
- 如果记录的任何内容不匹配,请跳过。否则,认为它们属于同一事件。
- 将数据集 2 中的其他内容附加到 1。
带有注释的示例代码:
(obsolete)
我从 Matlab 收到这条消息
Warning: Variable names were modified to make them valid MATLAB
identifiers.
因此您可能需要根据需要更改表中的列名。
这些是从您的 csv 文件导入的原始数据集
(obsolete)
示例输出:
(obsolete)
新数据集和输出:
>> T1
T1 =
IncidentDate AddressStreet AddressCityIncident AddressCountyIncident AddressState IncidentPostalCode DispatchNotified UnitNotified
____________ ___________________________________________ ___________________ _____________________ ____________ __________________ ________________ ________________
'1/1/2014' 'BURKE LAKE RD/BURKE RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 1:33' '1/1/2014 1:33'
'1/1/2014' 'BURKE LAKE RD/BURKE RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 1:33' '1/1/2014 1:33'
'1/1/2014' 'I95 SB TO OLD KEENE MILL RD' 'SPRINGFIELD' 'Fairfax County' 'VA' 22150 '1/1/2014 2:00' '1/1/2014 2:00'
'1/1/2014' 'SYDENSTRICKER RD/OLD KEENE MILL RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 4:54' '1/1/2014 4:54'
'1/1/2014' 'RT28 SB THRU RAMP/RT28 SB RAMP TO RT50 WB' 'CHANTILLY' 'Fairfax County' 'VA' 20151 '1/1/2014 12:28' '1/1/2014 12:28'
'1/1/2014' '11700 SWARTS DR' 'FAIRFAX' 'Fairfax County' 'VA' 22030 '1/1/2014 13:07' '1/1/2014 13:07'
'1/1/2014' '11700 SWARTS DR' 'FAIRFAX' 'Fairfax County' 'VA' 22030 '1/1/2014 13:07' '1/1/2014 13:07'
'1/1/2014' 'CENTREVILLE RD/BRADENTON DR' 'CENTREVILLE' 'Fairfax County' 'VA' 20121 '1/1/2014 13:41' '1/1/2014 13:41'
'1/1/2014' 'GEORGETOWN PIKE/CENTRILLION DR' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:45' '1/1/2014 16:45'
'1/1/2014' 'GEORGETOWN PIKE/CENTRILLION DR' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:42' '1/1/2014 16:42'
'1/1/2014' '8526 GEORGETOWN PIKE' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:49' '1/1/2014 16:49'
'1/1/2014' 'OX RD/BRADDOCK RD' 'FAIRFAX' 'Fairfax County' 'VA' 22032 '1/1/2014 22:32' '1/1/2014 22:32'
>> T2
T2 =
CrashDate County_Name City_Name Town_Name CrashTime SecondaryLocation RouteName PostalCityName
__________ ________________ _________ _________ _________ __________________________ ___________________ ______________
'1/1/2014' 'Fairfax County' NaN NaN '6:35' '' 'I95' 'LORTON'
'1/1/2014' 'Fairfax County' NaN NaN '5:19' '' 'I95 RAMP' 'SPRINGFIELD'
'1/1/2014' 'Fairfax County' NaN NaN '10:23' '' 'I495' 'ANNANDALE'
'1/1/2014' 'Fairfax County' NaN NaN '2:08' '' 'BUILDERS RD' 'HERNDON'
'1/1/2014' 'Fairfax County' NaN NaN '16:42' '' 'GEORGETOWN PIKE' 'MCLEAN'
'1/1/2014' 'Fairfax County' NaN NaN '20:55' 'LEESBURG PIKE' 'WILSON BLVD' 'FALLS CHURCH'
'1/1/2014' 'Fairfax County' NaN NaN '4:54' '' 'SYDENSTRICKER RD' 'BURKE'
'1/1/2014' 'Fairfax County' NaN NaN '2:34' 'BEACON HILL RD' 'RICHMOND HWY' 'ALEXANDRIA'
'1/1/2014' 'Fairfax County' NaN NaN '2:00' '' 'COAT RIDGE RD' 'HERNDON'
'1/1/2014' 'Fairfax County' NaN NaN '13:17' '' 'OLD KEENE MILL RD' 'BURKE'
'1/1/2014' 'Fairfax County' NaN NaN '5:19' 'MCLEAREN RD' 'CENTREVILLE RD' 'HERNDON'
'1/1/2014' 'Fairfax County' NaN NaN '21:48' 'VIRGINIA CENTER BLVD' 'VADEN DR' 'VIENNA'
'1/1/2014' 'Fairfax County' NaN NaN '19:59' 'FAIRFAX COUNTY PKWY RAMP' 'LEE HWY RAMP' 'FAIRFAX'
'1/1/2014' 'Fairfax County' NaN NaN '2:36' '' 'I95' 'SPRINGFIELD'
'1/1/2014' 'Fairfax County' NaN NaN '20:36' 'MOUNT GILEAD RD' 'BRADDOCK RD' 'CENTREVILLE'
'1/1/2014' 'Fairfax County' NaN NaN '1:46' '' 'I95' 'LORTON'
'1/1/2014' 'Fairfax County' NaN NaN '18:45' '' 'I495' 'HAMPTON'
'1/1/2014' 'Fairfax County' NaN NaN '13:40' 'BRADENTON DR' 'CENTREVILLE RD' 'CENTREVILLE'
'1/1/2014' 'Fairfax County' NaN NaN '17:24' 'SHREVE HILL RD' 'IDYLWOOD RD' 'DUNN LORING'
'1/1/2014' 'Fairfax County' NaN NaN '17:46' 'SACRAMENTO DR' 'RICHMOND HWY' 'ALEXANDRIA'
'1/1/2014' 'Fairfax County' NaN NaN '1:40' '' 'WINBOURNE RD' 'BURKE'
'1/1/2014' 'Fairfax County' NaN NaN '1:33' '' 'BURKE LAKE RD' 'BURKE'
'1/1/2014' 'Fairfax County' NaN NaN '15:44' 'TELEGRAPH RD' 'FRANCONIA RD' 'ALEXANDRIA'
'1/1/2014' 'Fairfax County' NaN NaN '22:19' 'OX RD' 'BRADDOCK RD' 'FAIRFAX'
'1/1/2014' 'Fairfax County' NaN NaN '12:27' '' 'SULLY RD' 'HERNDON'
'1/1/2014' 'Fairfax County' NaN NaN '11:25' 'MONUMENT DR' 'LEE HWY' 'FAIRFAX'
T1 =
IncidentDate AddressStreet AddressCityIncident AddressCountyIncident AddressState IncidentPostalCode DispatchNotified UnitNotified County_Name City_Name Town_Name CrashTime SecondaryLocation RouteName PostalCityName
____________ ___________________________________________ ___________________ _____________________ ____________ __________________ ________________ ________________ ________________ _________ _________ _________ _________________ __________________ ______________
'1/1/2014' 'BURKE LAKE RD/BURKE RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 1:33' '1/1/2014 1:33' 'Fairfax County' [NaN] [NaN] '1:33' '' 'BURKE LAKE RD' 'BURKE'
'1/1/2014' 'BURKE LAKE RD/BURKE RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 1:33' '1/1/2014 1:33' 'Fairfax County' [NaN] [NaN] '1:33' '' 'BURKE LAKE RD' 'BURKE'
'1/1/2014' 'I95 SB TO OLD KEENE MILL RD' 'SPRINGFIELD' 'Fairfax County' 'VA' 22150 '1/1/2014 2:00' '1/1/2014 2:00' '' '' '' '' '' '' ''
'1/1/2014' 'SYDENSTRICKER RD/OLD KEENE MILL RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 4:54' '1/1/2014 4:54' 'Fairfax County' [NaN] [NaN] '4:54' '' 'SYDENSTRICKER RD' 'BURKE'
'1/1/2014' 'RT28 SB THRU RAMP/RT28 SB RAMP TO RT50 WB' 'CHANTILLY' 'Fairfax County' 'VA' 20151 '1/1/2014 12:28' '1/1/2014 12:28' '' '' '' '' '' '' ''
'1/1/2014' '11700 SWARTS DR' 'FAIRFAX' 'Fairfax County' 'VA' 22030 '1/1/2014 13:07' '1/1/2014 13:07' '' '' '' '' '' '' ''
'1/1/2014' '11700 SWARTS DR' 'FAIRFAX' 'Fairfax County' 'VA' 22030 '1/1/2014 13:07' '1/1/2014 13:07' '' '' '' '' '' '' ''
'1/1/2014' 'CENTREVILLE RD/BRADENTON DR' 'CENTREVILLE' 'Fairfax County' 'VA' 20121 '1/1/2014 13:41' '1/1/2014 13:41' 'Fairfax County' [NaN] [NaN] '13:40' 'BRADENTON DR' 'CENTREVILLE RD' 'CENTREVILLE'
'1/1/2014' 'GEORGETOWN PIKE/CENTRILLION DR' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:45' '1/1/2014 16:45' 'Fairfax County' [NaN] [NaN] '16:42' '' 'GEORGETOWN PIKE' 'MCLEAN'
'1/1/2014' 'GEORGETOWN PIKE/CENTRILLION DR' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:42' '1/1/2014 16:42' 'Fairfax County' [NaN] [NaN] '16:42' '' 'GEORGETOWN PIKE' 'MCLEAN'
'1/1/2014' '8526 GEORGETOWN PIKE' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:49' '1/1/2014 16:49' 'Fairfax County' [NaN] [NaN] '16:42' '' 'GEORGETOWN PIKE' 'MCLEAN'
'1/1/2014' 'OX RD/BRADDOCK RD' 'FAIRFAX' 'Fairfax County' 'VA' 22032 '1/1/2014 22:32' '1/1/2014 22:32' 'Fairfax County' [NaN] [NaN] '22:19' 'OX RD' 'BRADDOCK RD' 'FAIRFAX'
我正在尝试合并 2 个数据集(150,000 条记录和 50,000 条记录),每个数据集大约有 50 个变量,其中一些可能匹配。两个数据集中的一个共同变量是 'Incident date',但我不能只用它来合并,因为同一个日期可能有大约 300 起事件(该数据集按地址、城市、县、邮编以及紧急医疗服务 (EMS) 接到通知的时间细分)。另一个数据集有事件发生的确切时间、地址、城市、县、邮政编码和其他一些字段,但如果信息未知或未记录,这些字段可能为空。
我想创建一个缓冲区来按每个字段连接数据集。例如,首先从事件发生的日期开始(没有缺失值),如果它们相同,下一步就是检查它们是否发生在同一个县市等。 (某些值可能为空)。比较的最后一个字段是 EMS 收到通知的时间(事件发生后最多 30 - 60 分钟)。如果所有字段都匹配,则最终缓冲时间为 30 - 60 分钟。这将是多对一的合并(50,000 到 150,000)。
哪个程序可以让我这样做?有一定的代码吗?
我添加了两个数据集的片段 (https://filedropper.com/filemanager/public.php?service=files&t=0f2d129b1622901fafc8c9e678433623&download) and (https://filedropper.com/filemanager/public.php?service=files&t=642c840bc3e431c3d4d839a71bb66944&download)
预期输出看起来像这样
使用的代码是:
% Merge crash records (dataset 2) into EMS incident records (dataset 1).
% For every row of T2, each not-yet-augmented row of T1 is tested on five
% criteria (date, street, city, county, notification time). A criterion
% whose field is empty on either side is left as "matching". When all five
% hold, seven T2 columns are copied into the T1 row.
T1 = readtable('dataset1.csv')
T2 = readtable('dataset2.csv')
LT1 = size(T1,1);
LT2 = size(T2,1);
% Columns copied from T2 into T1 on a match.
newvars = {'County_Name', 'City_Name', 'Town_Name', 'CrashTime', ...
    'SecondaryLocation', 'RouteName', 'PostalCityName'};
T1 = [T1, cell2table(repmat({''}, LT1, 7), 'VariableNames', newvars)]
augmented = false(LT1,1);          % true once a T1 row has received T2 data
dtstr = 'MM/dd/yyyy HH:mm';
trange = duration([0,0,0;1,0,0]);  % accepted (notification - crash) delay: 0 h to 1 h
for tt2 = 1:LT2
    cdate2 = T2.CrashDate{tt2};
    % NOTE(review): the column appended below is 'CrashTime'; confirm that
    % T2 really has a separate 'CrashDateTime' column.
    crasht2 = T2.CrashDateTime{tt2};
    assert(~isempty(cdate2) & ~isempty(crasht2),'Major data missing')
    crashdt2 = [cdate2, ' ', crasht2];
    crashdt2 = datetime(crashdt2,'InputFormat',dtstr);
    % Normalise the T2 keys once per T2 row: upper case, letters only.
    strtaddr2 = T2.RouteName{tt2};
    if ~isempty(strtaddr2)
        strtaddr2 = upper(strtaddr2);
        strtaddr2 = strrep(strtaddr2,'ROAD','RD');
        strtaddr2 = strtaddr2(isletter(strtaddr2));
    end
    pcityn2 = T2.PostalCityName{tt2};
    if ~isempty(pcityn2)
        pcityn2 = upper(pcityn2);
        pcityn2 = pcityn2(isletter(pcityn2));
    end
    countyn2 = T2.County_Name{tt2};
    if ~isempty(countyn2)
        countyn2 = countyn2(isletter(countyn2));
        countyn2 = upper(countyn2);
        countyn2 = strrep(countyn2,'COUNTY','');
    end
    for tt1 = 1:LT1
        if augmented(tt1)
            continue
        end
        % matchvec(k) stays true when criterion k cannot be evaluated.
        matchvec = true(5,1);
        % (1) incident date
        cdate1 = T1.IncidentDate{tt1};
        matchvec(1) = strcmp(cdate1, cdate2);
        % (2) street name
        strtaddr1 = upper(T1.AddressStreet{tt1});
        if ~isempty(strtaddr2) && ~isempty(strtaddr1)
            strtaddr1 = strrep(strtaddr1,'ROAD','RD');
            strtaddr1 = strtaddr1(isletter(strtaddr1));
            matchvec(2) = strcmp(strtaddr1,strtaddr2);
        end
        % (3) postal city name
        pcityn1 = upper(T1.AddressCityIncident{tt1});
        pcityn1 = pcityn1(isletter(pcityn1));
        if ~isempty(pcityn2) && ~isempty(pcityn1)
            matchvec(3) = strcmp(pcityn1,pcityn2);
        end
        % (4) county name; 'COUNTY' is stripped here as it is for countyn2,
        % otherwise 'FAIRFAXCOUNTY' would never equal 'FAIRFAX'.
        countyn1 = upper(T1.AddressCountyIncident{tt1});
        countyn1 = countyn1(isletter(countyn1));
        countyn1 = strrep(countyn1,'COUNTY','');
        if ~isempty(countyn2) && ~isempty(countyn1)
            matchvec(4) = strcmp(countyn1,countyn2);
        end
        % (5) notification times: each time that is present must fall
        % inside trange relative to the crash time. Both strings are parsed
        % to datetime before subtracting.
        crashdt1u = T1.UnitNotified{tt1};
        crashdt1d = T1.Date12_DispatchNotified{tt1};
        tmatch = true(2,1);
        if ~isempty(crashdt1u)
            difcrdt1u = datetime(crashdt1u,'InputFormat',dtstr) - crashdt2;
            tmatch(1) = difcrdt1u >= trange(1) && difcrdt1u <= trange(2);
        end
        if ~isempty(crashdt1d)
            difcrdt1d = datetime(crashdt1d,'InputFormat',dtstr) - crashdt2;
            tmatch(2) = difcrdt1d >= trange(1) && difcrdt1d <= trange(2);
        end
        matchvec(5) = all(tmatch);
        if all(matchvec)
            T1{tt1,newvars} = table2cell( T2(tt2,newvars) );
            augmented(tt1)=true;
        else
            % Debug output: show the rejected pair and which criteria failed.
            T1(tt1,:)
            T2(tt2,:)
            matchvec
        end
    end
end
T1
编辑:优化代码以提高性能;预计大量数据。
OP 注意事项:您的原始数据有很多错误。csv 文件的实际数据中任何地方都不允许出现逗号。某些字符串(发现 1 个单位通知时间)不符合预定义的格式。try 块只处理了这一种特殊情况;如果所有字段都可能存在缺陷数据,则应对所有字段都使用 try。所有这些问题都应该在合并之前解决。
% Pre-processing stage of the merge. Loads both tables, appends to T1 the
% columns that will receive crash data, and caches a normalised copy of
% every comparison key in T1B / T2B so the matching stage does not repeat
% the string work for every (T2 row, T1 row) pair.
clear;clc;close all
T1 = readtable('dataset1.csv');
T2 = readtable('dataset2.csv');
% Work on a leading subset; min() keeps the indexing valid for tables
% that are shorter than the requested subset.
T1 = T1(1:min(1000,size(T1,1)),:);
T2 = T2(1:min(900,size(T2,1)),:);
LT1 = size(T1,1);
LT2 = size(T2,1);
% append empty columns to T1 that will hold the fields copied from T2
T1 = [T1, cell2table(repmat({''}, LT1, 7), ...
    'VariableNames', {'County_Name', 'City_Name', 'Town_Name', ...
    'CrashTime', 'SecondaryLocation', 'RouteName', 'PostalCityName'})];
augmented = false(LT1,1); % per T1 row; used by the matching stage
dtstr = 'MM/dd/yyyy HH:mm';
trange = duration([-1,0,0;1,0,0]); % accepted (notification - crash) offset: -1 h to +1 h
% strtaddrcmpf(c1,c2): c1 and c2 are cell arrays of street-name segments.
% Returns one logical per segment s2 of c2: true when some segment s1 of
% c1 contains s2 and is contained in s2, i.e. the two are identical and
% non-empty. The any() around the inner cellfun keeps the inner result
% scalar, so c1 may hold more than one segment.
strtaddrcmpf = @(c1,c2) cellfun(@(s2) ...
    any(cellfun(@(s1) ...
    ~(isempty(strfind(s1,s2)) | isempty(strfind(s2,s1))), ...
    c1)), ...
    c2);
% buffer normalised keys to speed up matching; *Flg columns start true and
% are cleared when the corresponding field is missing or unparsable
fprintf('Pre-processing started at %s \n', datestr(datetime('now')))
T1B = cell2table([repmat({''}, LT1, 5), repmat({true}, LT1, 4)], ...
    'VariableNames', {'CrashDTU','CrashDTD', ...
    'StrtAdd','PoCityN', 'CountyN', ...
    'CrashDTFlg', 'StrtAddFlg', 'PoCityNFlg', 'CountyNFlg'});
T2B = cell2table([repmat({''}, LT2, 4), repmat({true}, LT2, 3)], ...
    'VariableNames', {'CrashDT', 'StrtAdd', 'PoCityN', 'CountyN', ...
    'StrtAddFlg', 'PoCityNFlg', 'CountyNFlg'});
% nprog is the length of the progress text currently on screen, so that
% exactly that many characters are erased before the next update.
nprog = fprintf('Progress: ');
for tt2 = 1:LT2
    fprintf('%s',repmat(sprintf('\b'),1,nprog))
    nprog = fprintf('Progress: %6.2f%%', tt2/LT2*50);
    cdate2 = T2.CrashDate{tt2};
    crasht2 = T2.CrashTime{tt2};
    assert(~isempty(cdate2) & ~isempty(crasht2),'Major data missing')
    crashdt2 = [cdate2, ' ', crasht2];
    T2B.CrashDT{tt2} = datetime(crashdt2,'InputFormat',dtstr);
    strtaddr2 = T2.RouteName{tt2};
    if ~isempty(strtaddr2)
        strtaddr2 = upper(strtaddr2);
        strtaddr2 = strrep(strtaddr2,'ROAD','RD'); % repeat for HWY ST etc
        strtaddr2 = strsplit(strtaddr2,'/');       % intersections -> segments
        % strfind on a cell array returns a cell array of index vectors;
        % as a case expression it is compared element-wise with the switch
        % value, so a case is taken when a segment has the interstate name
        % at position 1 (true == 1). Interstate names keep their digits;
        % all other segments are reduced to letters only.
        switch true
            case strfind(strtaddr2,'I95')
                strtaddr2 = {'I95'};
            case strfind(strtaddr2,'I495')
                strtaddr2 = {'I495'};
            otherwise
                strtaddr2 = cellfun(@(s) s(isletter(s)), ...
                    strtaddr2, 'Uniform',false);
        end
        T2B.StrtAdd{tt2} = strtaddr2;
    else
        T2B.StrtAddFlg(tt2) = false;
    end
    pcityn2 = T2.PostalCityName{tt2};
    if ~isempty(pcityn2)
        pcityn2 = upper(pcityn2);
        pcityn2 = pcityn2(isletter(pcityn2));
        T2B.PoCityN{tt2} = pcityn2;
    else
        T2B.PoCityNFlg(tt2) = false;
    end
    countyn2 = T2.County_Name{tt2};
    if ~isempty(countyn2)
        countyn2 = upper(countyn2);
        countyn2 = countyn2(isletter(countyn2));
        countyn2 = strrep(countyn2,'COUNTY','');
        T2B.CountyN{tt2} = countyn2;
    else
        T2B.CountyNFlg(tt2) = false;
    end
end
for tt1 = 1:LT1
    fprintf('%s',repmat(sprintf('\b'),1,nprog))
    nprog = fprintf('Progress: %6.2f%%', tt1/LT1*50+50);
    strtaddr1 = upper(T1.AddressStreet{tt1});
    if ~isempty(strtaddr1)
        strtaddr1 = strrep(strtaddr1,'ROAD','RD');
        strtaddr1 = strsplit(strtaddr1,'/');
        % same interstate handling as for T2 above
        switch true
            case strfind(strtaddr1,'I95')
                strtaddr1 = {'I95'};
            case strfind(strtaddr1,'I495')
                strtaddr1 = {'I495'};
            otherwise
                strtaddr1 = cellfun(@(s) s(isletter(s)), ...
                    strtaddr1, 'Uniform',false);
        end
        T1B.StrtAdd{tt1} = strtaddr1;
    else
        T1B.StrtAddFlg(tt1) = false;
    end
    pcityn1 = upper(T1.AddressCityIncident{tt1});
    if ~isempty(pcityn1)
        pcityn1 = pcityn1(isletter(pcityn1));
        T1B.PoCityN{tt1} = pcityn1;
    else
        T1B.PoCityNFlg(tt1) = false;
    end
    countyn1 = upper(T1.AddressCountyIncident{tt1});
    if ~isempty(countyn1)
        countyn1 = countyn1(isletter(countyn1));
        countyn1 = strrep(countyn1,'COUNTY','');
        T1B.CountyN{tt1} = countyn1;
    else
        T1B.CountyNFlg(tt1) = false;
    end
    crashdt1u = T1.UnitNotified{tt1};
    crashdt1d = T1.DispatchNotified{tt1};
    if ~isempty(crashdt1u) || ~isempty(crashdt1d)
        % a little dirty here, need both date and time; a string that does
        % not parse with dtstr disables the time check for this row
        try
            if ~isempty(crashdt1u)
                crashdt1u = datetime(crashdt1u,'InputFormat',dtstr);
                T1B.CrashDTU{tt1} = crashdt1u;
            end
            if ~isempty(crashdt1d)
                crashdt1d = datetime(crashdt1d,'InputFormat',dtstr);
                T1B.CrashDTD{tt1} = crashdt1d;
            end
        catch
            T1B.CrashDTFlg(tt1) = false;
        end
    else
        T1B.CrashDTFlg(tt1) = false;
    end
end
fprintf('%s',repmat(sprintf('\b'),1,nprog))
fprintf('Pre-processing finished at %s \n', ...
    datestr(datetime('now')))
% Matching stage. For each T2 row, scan every T1 row that has not been
% augmented yet and test the criteria in order (date, street, city, county,
% notification time), skipping to the next T1 row at the first failure.
% A criterion whose data is missing on either side counts as a match.
% Uses T1, T2, T1B, T2B, LT1, LT2, augmented, trange and strtaddrcmpf from
% the pre-processing stage.
fprintf('Matching started at %s \n', datestr(datetime('now')))
% process data
% nprog is the length of the progress text currently on screen, so that
% exactly that many characters are erased before the next update.
nprog = fprintf('Progress: ');
for tt2 = 1:LT2
    fprintf('%s',repmat(sprintf('\b'),1,nprog))
    nprog = fprintf('Progress: %6.2f%%', tt2/LT2*100);
    % extract a row for comparison; cdate2 must be read for this T2 row,
    % otherwise it would keep the value left over from pre-processing
    cdate2 = T2.CrashDate{tt2};
    crashdt2 = T2B.CrashDT{tt2};
    strtaddr2 = T2B.StrtAdd{tt2};
    pcityn2 = T2B.PoCityN{tt2};
    countyn2 = T2B.CountyN{tt2};
    for tt1 = 1:LT1
        if augmented(tt1) % match already found, skip
            continue
        end
        % Boolean comparison: treat missing data as identical
        cdate1 = T1.IncidentDate{tt1};
        match1 = strcmp(cdate1, cdate2); % incident date
        if ~match1
            continue
        end
        match2 = true; % default when a street name is missing
        if T2B.StrtAddFlg(tt2) && T1B.StrtAddFlg(tt1) % put 2 first: faster
            strtaddr1 = T1B.StrtAdd{tt1};
            strtaddr_cmp = strtaddrcmpf(strtaddr2,strtaddr1);
            match2 = any(strtaddr_cmp); % street name match
        end
        if ~match2
            continue
        end
        match3 = true; % default when a city name is missing
        if T2B.PoCityNFlg(tt2) && T1B.PoCityNFlg(tt1)
            pcityn1 = T1B.PoCityN{tt1};
            match3 = strcmp(pcityn1,pcityn2); % postal city name match
        end
        if ~match3
            continue
        end
        match4 = true; % default when a county name is missing
        if T2B.CountyNFlg(tt2) && T1B.CountyNFlg(tt1)
            % T1B.CountyN was already normalised during pre-processing
            countyn1 = T1B.CountyN{tt1};
            match4 = strcmp(countyn1,countyn2); % county name match
        end
        if ~match4
            continue
        end
        match5 = true; % default when no usable notification time exists
        if T1B.CrashDTFlg(tt1)
            crashdt1u = T1B.CrashDTU{tt1};
            crashdt1d = T1B.CrashDTD{tt1};
            % each notification time that is present must lie in trange
            % relative to the crash time; an absent one does not veto
            tmatch1 = true;
            tmatch2 = true;
            if ~isempty(crashdt1u)
                difcrdt1u = crashdt1u-crashdt2;
                tmatch1 = difcrdt1u >= trange(1) && difcrdt1u <= trange(2);
            end
            if ~isempty(crashdt1d)
                difcrdt1d = crashdt1d-crashdt2;
                tmatch2 = difcrdt1d >= trange(1) && difcrdt1d <= trange(2);
            end
            match5 = tmatch1 & tmatch2;
        end
        if ~match5
            continue
        end
        % append row in T2 to T1
        T1{tt1,{'County_Name', 'City_Name', 'Town_Name', ...
            'CrashTime', 'SecondaryLocation', 'RouteName', ...
            'PostalCityName'}} = ...
            table2cell( T2(tt2,{'County_Name', 'City_Name', ...
            'Town_Name', 'CrashTime', 'SecondaryLocation', ...
            'RouteName', 'PostalCityName'}) );
        augmented(tt1) = true;
        % break % assume unique matching
    end
end
fprintf('%s',repmat(sprintf('\b'),1,nprog))
fprintf('Matching finished at %s \nTotalling %d matches. \n', ...
    datestr(datetime('now')), sum(augmented))
编辑:随着OP新上传的数据集,涵盖了更多案例。
'GEORGETOWN PIKE/CENTRILLION DR'
等交叉路口应与'GEORGETOWN PIKE'
或'CENTRILLION DR'
匹配。- 州际公路名称,如
'I95'
,其名称中包含数字,应与街道号码区分开来。 - 州际公路名称有时包含应忽略的详细位置。 (并查看其他信息)
添加了进度显示。
编辑:我忘记使用 augmented
记录来加快速度。此外,在最后添加了调试部分,以便查看匹配过程中哪些条件不满足。
这是一个在 Matlab 中使用 table
class 的解决方案。由于这是一项相当新的功能,因此在不同版本的 Matlab 中进行编程可能会有所不同。我正在使用 R2015b。
要点:
- 对于数据集 2 中的每一行,查找数据集 1 中所有行的匹配项。
- 如果记录的任何内容不匹配,请跳过。否则,认为它们属于同一事件。
- 将数据集 2 中的其他内容附加到 1。
带有注释的示例代码:
(obsolete)
我从 Matlab 收到这条消息
Warning: Variable names were modified to make them valid MATLAB identifiers.
因此您可能需要根据需要更改表中的列名。
这些是从您的 csv 文件导入的原始数据集
(obsolete)
示例输出:
(obsolete)
新数据集和输出:
>> T1
T1 =
IncidentDate AddressStreet AddressCityIncident AddressCountyIncident AddressState IncidentPostalCode DispatchNotified UnitNotified
____________ ___________________________________________ ___________________ _____________________ ____________ __________________ ________________ ________________
'1/1/2014' 'BURKE LAKE RD/BURKE RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 1:33' '1/1/2014 1:33'
'1/1/2014' 'BURKE LAKE RD/BURKE RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 1:33' '1/1/2014 1:33'
'1/1/2014' 'I95 SB TO OLD KEENE MILL RD' 'SPRINGFIELD' 'Fairfax County' 'VA' 22150 '1/1/2014 2:00' '1/1/2014 2:00'
'1/1/2014' 'SYDENSTRICKER RD/OLD KEENE MILL RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 4:54' '1/1/2014 4:54'
'1/1/2014' 'RT28 SB THRU RAMP/RT28 SB RAMP TO RT50 WB' 'CHANTILLY' 'Fairfax County' 'VA' 20151 '1/1/2014 12:28' '1/1/2014 12:28'
'1/1/2014' '11700 SWARTS DR' 'FAIRFAX' 'Fairfax County' 'VA' 22030 '1/1/2014 13:07' '1/1/2014 13:07'
'1/1/2014' '11700 SWARTS DR' 'FAIRFAX' 'Fairfax County' 'VA' 22030 '1/1/2014 13:07' '1/1/2014 13:07'
'1/1/2014' 'CENTREVILLE RD/BRADENTON DR' 'CENTREVILLE' 'Fairfax County' 'VA' 20121 '1/1/2014 13:41' '1/1/2014 13:41'
'1/1/2014' 'GEORGETOWN PIKE/CENTRILLION DR' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:45' '1/1/2014 16:45'
'1/1/2014' 'GEORGETOWN PIKE/CENTRILLION DR' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:42' '1/1/2014 16:42'
'1/1/2014' '8526 GEORGETOWN PIKE' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:49' '1/1/2014 16:49'
'1/1/2014' 'OX RD/BRADDOCK RD' 'FAIRFAX' 'Fairfax County' 'VA' 22032 '1/1/2014 22:32' '1/1/2014 22:32'
>> T2
T2 =
CrashDate County_Name City_Name Town_Name CrashTime SecondaryLocation RouteName PostalCityName
__________ ________________ _________ _________ _________ __________________________ ___________________ ______________
'1/1/2014' 'Fairfax County' NaN NaN '6:35' '' 'I95' 'LORTON'
'1/1/2014' 'Fairfax County' NaN NaN '5:19' '' 'I95 RAMP' 'SPRINGFIELD'
'1/1/2014' 'Fairfax County' NaN NaN '10:23' '' 'I495' 'ANNANDALE'
'1/1/2014' 'Fairfax County' NaN NaN '2:08' '' 'BUILDERS RD' 'HERNDON'
'1/1/2014' 'Fairfax County' NaN NaN '16:42' '' 'GEORGETOWN PIKE' 'MCLEAN'
'1/1/2014' 'Fairfax County' NaN NaN '20:55' 'LEESBURG PIKE' 'WILSON BLVD' 'FALLS CHURCH'
'1/1/2014' 'Fairfax County' NaN NaN '4:54' '' 'SYDENSTRICKER RD' 'BURKE'
'1/1/2014' 'Fairfax County' NaN NaN '2:34' 'BEACON HILL RD' 'RICHMOND HWY' 'ALEXANDRIA'
'1/1/2014' 'Fairfax County' NaN NaN '2:00' '' 'COAT RIDGE RD' 'HERNDON'
'1/1/2014' 'Fairfax County' NaN NaN '13:17' '' 'OLD KEENE MILL RD' 'BURKE'
'1/1/2014' 'Fairfax County' NaN NaN '5:19' 'MCLEAREN RD' 'CENTREVILLE RD' 'HERNDON'
'1/1/2014' 'Fairfax County' NaN NaN '21:48' 'VIRGINIA CENTER BLVD' 'VADEN DR' 'VIENNA'
'1/1/2014' 'Fairfax County' NaN NaN '19:59' 'FAIRFAX COUNTY PKWY RAMP' 'LEE HWY RAMP' 'FAIRFAX'
'1/1/2014' 'Fairfax County' NaN NaN '2:36' '' 'I95' 'SPRINGFIELD'
'1/1/2014' 'Fairfax County' NaN NaN '20:36' 'MOUNT GILEAD RD' 'BRADDOCK RD' 'CENTREVILLE'
'1/1/2014' 'Fairfax County' NaN NaN '1:46' '' 'I95' 'LORTON'
'1/1/2014' 'Fairfax County' NaN NaN '18:45' '' 'I495' 'HAMPTON'
'1/1/2014' 'Fairfax County' NaN NaN '13:40' 'BRADENTON DR' 'CENTREVILLE RD' 'CENTREVILLE'
'1/1/2014' 'Fairfax County' NaN NaN '17:24' 'SHREVE HILL RD' 'IDYLWOOD RD' 'DUNN LORING'
'1/1/2014' 'Fairfax County' NaN NaN '17:46' 'SACRAMENTO DR' 'RICHMOND HWY' 'ALEXANDRIA'
'1/1/2014' 'Fairfax County' NaN NaN '1:40' '' 'WINBOURNE RD' 'BURKE'
'1/1/2014' 'Fairfax County' NaN NaN '1:33' '' 'BURKE LAKE RD' 'BURKE'
'1/1/2014' 'Fairfax County' NaN NaN '15:44' 'TELEGRAPH RD' 'FRANCONIA RD' 'ALEXANDRIA'
'1/1/2014' 'Fairfax County' NaN NaN '22:19' 'OX RD' 'BRADDOCK RD' 'FAIRFAX'
'1/1/2014' 'Fairfax County' NaN NaN '12:27' '' 'SULLY RD' 'HERNDON'
'1/1/2014' 'Fairfax County' NaN NaN '11:25' 'MONUMENT DR' 'LEE HWY' 'FAIRFAX'
T1 =
IncidentDate AddressStreet AddressCityIncident AddressCountyIncident AddressState IncidentPostalCode DispatchNotified UnitNotified County_Name City_Name Town_Name CrashTime SecondaryLocation RouteName PostalCityName
____________ ___________________________________________ ___________________ _____________________ ____________ __________________ ________________ ________________ ________________ _________ _________ _________ _________________ __________________ ______________
'1/1/2014' 'BURKE LAKE RD/BURKE RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 1:33' '1/1/2014 1:33' 'Fairfax County' [NaN] [NaN] '1:33' '' 'BURKE LAKE RD' 'BURKE'
'1/1/2014' 'BURKE LAKE RD/BURKE RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 1:33' '1/1/2014 1:33' 'Fairfax County' [NaN] [NaN] '1:33' '' 'BURKE LAKE RD' 'BURKE'
'1/1/2014' 'I95 SB TO OLD KEENE MILL RD' 'SPRINGFIELD' 'Fairfax County' 'VA' 22150 '1/1/2014 2:00' '1/1/2014 2:00' '' '' '' '' '' '' ''
'1/1/2014' 'SYDENSTRICKER RD/OLD KEENE MILL RD' 'BURKE' 'Fairfax County' 'VA' 22015 '1/1/2014 4:54' '1/1/2014 4:54' 'Fairfax County' [NaN] [NaN] '4:54' '' 'SYDENSTRICKER RD' 'BURKE'
'1/1/2014' 'RT28 SB THRU RAMP/RT28 SB RAMP TO RT50 WB' 'CHANTILLY' 'Fairfax County' 'VA' 20151 '1/1/2014 12:28' '1/1/2014 12:28' '' '' '' '' '' '' ''
'1/1/2014' '11700 SWARTS DR' 'FAIRFAX' 'Fairfax County' 'VA' 22030 '1/1/2014 13:07' '1/1/2014 13:07' '' '' '' '' '' '' ''
'1/1/2014' '11700 SWARTS DR' 'FAIRFAX' 'Fairfax County' 'VA' 22030 '1/1/2014 13:07' '1/1/2014 13:07' '' '' '' '' '' '' ''
'1/1/2014' 'CENTREVILLE RD/BRADENTON DR' 'CENTREVILLE' 'Fairfax County' 'VA' 20121 '1/1/2014 13:41' '1/1/2014 13:41' 'Fairfax County' [NaN] [NaN] '13:40' 'BRADENTON DR' 'CENTREVILLE RD' 'CENTREVILLE'
'1/1/2014' 'GEORGETOWN PIKE/CENTRILLION DR' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:45' '1/1/2014 16:45' 'Fairfax County' [NaN] [NaN] '16:42' '' 'GEORGETOWN PIKE' 'MCLEAN'
'1/1/2014' 'GEORGETOWN PIKE/CENTRILLION DR' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:42' '1/1/2014 16:42' 'Fairfax County' [NaN] [NaN] '16:42' '' 'GEORGETOWN PIKE' 'MCLEAN'
'1/1/2014' '8526 GEORGETOWN PIKE' 'MCLEAN' 'Fairfax County' 'VA' 22102 '1/1/2014 16:49' '1/1/2014 16:49' 'Fairfax County' [NaN] [NaN] '16:42' '' 'GEORGETOWN PIKE' 'MCLEAN'
'1/1/2014' 'OX RD/BRADDOCK RD' 'FAIRFAX' 'Fairfax County' 'VA' 22032 '1/1/2014 22:32' '1/1/2014 22:32' 'Fairfax County' [NaN] [NaN] '22:19' 'OX RD' 'BRADDOCK RD' 'FAIRFAX'