使用 python 从 text/javascript 中提取数据
Extract data from text/javascript using python
如何通过键 'video_url' 提取出 'https://www.example.com/get_file/4/b315c8e0d6fad43d89445378b5292eed6981a999ba/174000/174469/174469.mp4/?br=707'?
[<script src="https://www.example.com/player/kt_player.js?v=5.5.1" type="text/javascript"></script>, <script type="text/javascript">
/* <![CDATA[ */
function getEmbed(width, height) {
if (width && height) {
return '<iframe width="' + width + '" height="' + height + '" src="https://www.example.com/embed/174469" frameborder="0" allowfullscreen webkitallowfullscreen mozallowfullscreen oallowfullscreen msallowfullscreen></iframe>';
}
return '<iframe width="852" height="480" src="https://www.example.com/embed/174469" frameborder="0" allowfullscreen webkitallowfullscreen mozallowfullscreen oallowfullscreen msallowfullscreen></iframe>';
}
var flashvars = {
video_id: '174469', video_categories: 'example_category1', 'example_category2', video_tags: 'example_tag1', 'example_tag2', license_code: '3825119921245', rnd: '1650848189', video_url: 'https://www.example.com/get_file/4/b315c8e0d6fad43d89445378b5292eed6981a999ba/174000/174469/174469.mp4/?br=707', postfix: '.mp4', video_url_text: '480p', video_alt_url: 'https://www.example.com/get_file/4/ffafbe6913656c2250c34bf20fd945a5f86898d749/174000/174469/174469_720p.mp4/?br=1290', video_alt_url_text: '720p', video_alt_url_hd: '1', video_alt_url2: 'https://www.example.com/get_file/4/66c8876a9fd8cd3d823d06880c1797b1424f3200df/174000/174469/174469_1080p.mp4/?br=2559', video_alt_url2_text: '1080p', video_alt_url2_hd: '1', preview_url: 'https://www.example.com/contents/videos_screenshots/174000/174469/preview.jpg', preview_url1: 'https://www.example.com/contents/videos_screenshots/174000/174469/preview.mp4.jpg', preview_height1: '480', preview_url2: 'https://www.example.com/contents/videos_screenshots/174000/174469/preview_720p.mp4.jpg', preview_height2: '720', preview_url3: 'https://www.example.com/contents/videos_screenshots/174000/174469/preview_1080p.mp4.jpg', preview_height3: '1080', skin: 'youtube.css', logo_position: '0,0', logo_anchor: 'topleft', hide_controlbar: '1', hide_style: 'fade', volume: '1', related_src: 'https://www.example.com/related_videos_html/174469/', adv_pre_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}', adv_pre_skip_duration: '5', adv_pre_skip_text_time: 'Skip ad in %time', adv_pre_skip_text: 'Skip ad', adv_post_vast: 
'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}', adv_post_skip_duration: '5', adv_post_skip_text_time: 'Skip ad in %time', adv_post_skip_text: 'Skip ad', lrcv: '1651572296480833989009946', vast_timeout1: '10', player_width: '882', player_height: '496.9014084507', embed: '1' };
var player_obj = kt_player('kt_player', 'https://www.example.com/player/kt_player.swf?v=5.5.1', '100%', '100%', flashvars);
window.onload = function() {
$('.pop-adv .btn').click(function(e) {
player_obj.play();
});
};
/* ]]> */
</script>]
我试过了;
import json
script= """[<script src="https://www.example.com/player/kt_player.js?v=5.5.1" type="text/javascript"></script>, <script type="text/javascript">
/* <![CDATA[ */
function getEmbed(width, height) {
if (width && height) {
return '<iframe width="' + width + '" height="' + height + '" src="https://www.example.com/embed/174469" frameborder="0" allowfullscreen webkitallowfullscreen mozallowfullscreen oallowfullscreen msallowfullscreen></iframe>';
}
return '<iframe width="852" height="480" src="https://www.example.com/embed/174469" frameborder="0" allowfullscreen webkitallowfullscreen mozallowfullscreen oallowfullscreen msallowfullscreen></iframe>';
}
var flashvars = {
video_id: '174469', video_categories: 'example_category1, example_category2', video_tags: 'example_tag1, esample_tag2', license_code: '3825119921245', rnd: '1650848189', video_url: 'https://www.example.com/get_file/4/b315c8e0d6fad43d89445378b5292eed6981a999ba/174000/174469/174469.mp4/?br=707', postfix: '.mp4', video_url_text: '480p', video_alt_url: 'https://www.example.com/get_file/4/ffafbe6913656c2250c34bf20fd945a5f86898d749/174000/174469/174469_720p.mp4/?br=1290', video_alt_url_text: '720p', video_alt_url_hd: '1', video_alt_url2: 'https://www.example.com/get_file/4/66c8876a9fd8cd3d823d06880c1797b1424f3200df/174000/174469/174469_1080p.mp4/?br=2559', video_alt_url2_text: '1080p', video_alt_url2_hd: '1', preview_url: 'https://www.example.com/contents/videos_screenshots/174000/174469/preview.jpg', preview_url1: 'https://www.example.com/contents/videos_screenshots/174000/174469/preview.mp4.jpg', preview_height1: '480', preview_url2: 'https://www.example.com/contents/videos_screenshots/174000/174469/preview_720p.mp4.jpg', preview_height2: '720', preview_url3: 'https://www.example.com/contents/videos_screenshots/174000/174469/preview_1080p.mp4.jpg', preview_height3: '1080', skin: 'youtube.css', logo_position: '0,0', logo_anchor: 'topleft', hide_controlbar: '1', hide_style: 'fade', volume: '1', related_src: 'https://www.example.com/related_videos_html/174469/', adv_pre_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}', adv_pre_skip_duration: '5', adv_pre_skip_text_time: 'Skip ad in %time', adv_pre_skip_text: 'Skip ad', adv_post_vast: 
'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}', adv_post_skip_duration: '5', adv_post_skip_text_time: 'Skip ad in %time', adv_post_skip_text: 'Skip ad', lrcv: '1651572296480833989009946', vast_timeout1: '10', player_width: '882', player_height: '496.9014084507', embed: '1' };
var player_obj = kt_player('kt_player', 'https://www.example.com/player/kt_player.swf?v=5.5.1', '100%', '100%', flashvars);
window.onload = function() {
$('.pop-adv .btn').click(function(e) {
player_obj.play();
});
};
/* ]]> */
</script>]"""
# NOTE: this call raises json.decoder.JSONDecodeError because `script` holds
# HTML/JavaScript text (it starts with "[<script"), not a JSON document.
json_data= json.loads(script)
print(json_data['video_url'])
遇到这个错误:
json.decoder.JSONDecodeError: Expecting value: line 1 column 2 (char 1)
这不是一个理想的解决方案,因为它依赖于格式非常一致的源文档,但您可以尝试使用正则表达式“手动”解析它:
以下假设您已将 script
设置为包含上面下载的文本。
import re
def extract(name, script):
    """Return the quoted string value that follows ``name:`` in ``script``.

    The lookup is a regular-expression search, not a real JavaScript parse,
    so it relies on the source keeping a consistent ``key: 'value'`` layout.

    Args:
        name: Key to look for. It is interpolated into the pattern unescaped,
            so regex metacharacters in it are treated as regex syntax.
        script: Text to search, e.g. the contents of a ``<script>`` tag.

    Returns:
        The text between the quotes of the first match, without the quotes.

    Raises:
        TypeError: If no ``name: '...'`` pair is found (``re.search`` returns
            ``None``, which cannot be subscripted).
    """
    # Group 1 captures the opening quote; the trailing \1 backreference forces
    # the lazy (.*?) to extend up to the matching closing quote. Without it
    # the lazy group matches the empty string and the function returns ''.
    return re.search(rf'\b{name}\s*:\s*(\'|")(.*?)\1', script)[2]
extract('video_url', script)
'https://www.example.com/get_file/4/b315c8e0d6fad43d89445378b5292eed6981a999ba/174000/174469/174469.mp4/?br=707'
extract('video_alt_url', script)
'https://www.example.com/get_file/4/ffafbe6913656c2250c34bf20fd945a5f86898d749/174000/174469/174469_720p.mp4/?br=1290'
extract('video_alt_url2', script)
'https://www.example.com/get_file/4/66c8876a9fd8cd3d823d06880c1797b1424f3200df/174000/174469/174469_1080p.mp4/?br=2559'
正则表达式的工作原理如下:
\b{name}\s*:\s*
匹配允许可变间距的 name:
部分
('|")(.*?)\1
匹配 'some_text'
或 "some_text"
样式字符串;末尾的 \1 要求结束引号与起始引号相同
- 最后的
[2]
取第二组,即匹配引号内文字的(.*?)
如何通过键 'video_url' 提取出 'https://www.example.com/get_file/4/b315c8e0d6fad43d89445378b5292eed6981a999ba/174000/174469/174469.mp4/?br=707'?
[<script src="https://www.example.com/player/kt_player.js?v=5.5.1" type="text/javascript"></script>, <script type="text/javascript">
/* <![CDATA[ */
function getEmbed(width, height) {
if (width && height) {
return '<iframe width="' + width + '" height="' + height + '" src="https://www.example.com/embed/174469" frameborder="0" allowfullscreen webkitallowfullscreen mozallowfullscreen oallowfullscreen msallowfullscreen></iframe>';
}
return '<iframe width="852" height="480" src="https://www.example.com/embed/174469" frameborder="0" allowfullscreen webkitallowfullscreen mozallowfullscreen oallowfullscreen msallowfullscreen></iframe>';
}
var flashvars = {
video_id: '174469', video_categories: 'example_category1', 'example_category2', video_tags: 'example_tag1', 'example_tag2', license_code: '3825119921245', rnd: '1650848189', video_url: 'https://www.example.com/get_file/4/b315c8e0d6fad43d89445378b5292eed6981a999ba/174000/174469/174469.mp4/?br=707', postfix: '.mp4', video_url_text: '480p', video_alt_url: 'https://www.example.com/get_file/4/ffafbe6913656c2250c34bf20fd945a5f86898d749/174000/174469/174469_720p.mp4/?br=1290', video_alt_url_text: '720p', video_alt_url_hd: '1', video_alt_url2: 'https://www.example.com/get_file/4/66c8876a9fd8cd3d823d06880c1797b1424f3200df/174000/174469/174469_1080p.mp4/?br=2559', video_alt_url2_text: '1080p', video_alt_url2_hd: '1', preview_url: 'https://www.example.com/contents/videos_screenshots/174000/174469/preview.jpg', preview_url1: 'https://www.example.com/contents/videos_screenshots/174000/174469/preview.mp4.jpg', preview_height1: '480', preview_url2: 'https://www.example.com/contents/videos_screenshots/174000/174469/preview_720p.mp4.jpg', preview_height2: '720', preview_url3: 'https://www.example.com/contents/videos_screenshots/174000/174469/preview_1080p.mp4.jpg', preview_height3: '1080', skin: 'youtube.css', logo_position: '0,0', logo_anchor: 'topleft', hide_controlbar: '1', hide_style: 'fade', volume: '1', related_src: 'https://www.example.com/related_videos_html/174469/', adv_pre_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}', adv_pre_skip_duration: '5', adv_pre_skip_text_time: 'Skip ad in %time', adv_pre_skip_text: 'Skip ad', adv_post_vast: 
'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}', adv_post_skip_duration: '5', adv_post_skip_text_time: 'Skip ad in %time', adv_post_skip_text: 'Skip ad', lrcv: '1651572296480833989009946', vast_timeout1: '10', player_width: '882', player_height: '496.9014084507', embed: '1' };
var player_obj = kt_player('kt_player', 'https://www.example.com/player/kt_player.swf?v=5.5.1', '100%', '100%', flashvars);
window.onload = function() {
$('.pop-adv .btn').click(function(e) {
player_obj.play();
});
};
/* ]]> */
</script>]
我试过了;
import json
script= """[<script src="https://www.example.com/player/kt_player.js?v=5.5.1" type="text/javascript"></script>, <script type="text/javascript">
/* <![CDATA[ */
function getEmbed(width, height) {
if (width && height) {
return '<iframe width="' + width + '" height="' + height + '" src="https://www.example.com/embed/174469" frameborder="0" allowfullscreen webkitallowfullscreen mozallowfullscreen oallowfullscreen msallowfullscreen></iframe>';
}
return '<iframe width="852" height="480" src="https://www.example.com/embed/174469" frameborder="0" allowfullscreen webkitallowfullscreen mozallowfullscreen oallowfullscreen msallowfullscreen></iframe>';
}
var flashvars = {
video_id: '174469', video_categories: 'example_category1, example_category2', video_tags: 'example_tag1, esample_tag2', license_code: '3825119921245', rnd: '1650848189', video_url: 'https://www.example.com/get_file/4/b315c8e0d6fad43d89445378b5292eed6981a999ba/174000/174469/174469.mp4/?br=707', postfix: '.mp4', video_url_text: '480p', video_alt_url: 'https://www.example.com/get_file/4/ffafbe6913656c2250c34bf20fd945a5f86898d749/174000/174469/174469_720p.mp4/?br=1290', video_alt_url_text: '720p', video_alt_url_hd: '1', video_alt_url2: 'https://www.example.com/get_file/4/66c8876a9fd8cd3d823d06880c1797b1424f3200df/174000/174469/174469_1080p.mp4/?br=2559', video_alt_url2_text: '1080p', video_alt_url2_hd: '1', preview_url: 'https://www.example.com/contents/videos_screenshots/174000/174469/preview.jpg', preview_url1: 'https://www.example.com/contents/videos_screenshots/174000/174469/preview.mp4.jpg', preview_height1: '480', preview_url2: 'https://www.example.com/contents/videos_screenshots/174000/174469/preview_720p.mp4.jpg', preview_height2: '720', preview_url3: 'https://www.example.com/contents/videos_screenshots/174000/174469/preview_1080p.mp4.jpg', preview_height3: '1080', skin: 'youtube.css', logo_position: '0,0', logo_anchor: 'topleft', hide_controlbar: '1', hide_style: 'fade', volume: '1', related_src: 'https://www.example.com/related_videos_html/174469/', adv_pre_vast: 'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}', adv_pre_skip_duration: '5', adv_pre_skip_text_time: 'Skip ad in %time', adv_pre_skip_text: 'Skip ad', adv_post_vast: 
'https://twinrdsrv.com/preroll.engine?id=613eb379-62dd-49ef-8299-db2b5b2af4d7&zid=12861&cvs={ClientVideoSupport}&time={TimeOffset}&stdtime={StdTimeOffset}&abr={IsAdblockRequest}&pageurl={PageUrl}&tid={TrackingId}&res={Resolution}&bw={BrowserWidth}&bh={BrowserHeight}&kw={Keywords}&referrerUrl={ReferrerUrl}&pw={PlayerWidth}&ph={PlayerHeight}', adv_post_skip_duration: '5', adv_post_skip_text_time: 'Skip ad in %time', adv_post_skip_text: 'Skip ad', lrcv: '1651572296480833989009946', vast_timeout1: '10', player_width: '882', player_height: '496.9014084507', embed: '1' };
var player_obj = kt_player('kt_player', 'https://www.example.com/player/kt_player.swf?v=5.5.1', '100%', '100%', flashvars);
window.onload = function() {
$('.pop-adv .btn').click(function(e) {
player_obj.play();
});
};
/* ]]> */
</script>]"""
# NOTE: this call raises json.decoder.JSONDecodeError because `script` holds
# HTML/JavaScript text (it starts with "[<script"), not a JSON document.
json_data= json.loads(script)
print(json_data['video_url'])
遇到这个错误:
json.decoder.JSONDecodeError: Expecting value: line 1 column 2 (char 1)
这不是一个理想的解决方案,因为它依赖于格式非常一致的源文档,但您可以尝试使用正则表达式“手动”解析它:
以下假设您已将 script
设置为包含上面下载的文本。
import re
def extract(name, script):
    """Return the quoted string value that follows ``name:`` in ``script``.

    The lookup is a regular-expression search, not a real JavaScript parse,
    so it relies on the source keeping a consistent ``key: 'value'`` layout.

    Args:
        name: Key to look for. It is interpolated into the pattern unescaped,
            so regex metacharacters in it are treated as regex syntax.
        script: Text to search, e.g. the contents of a ``<script>`` tag.

    Returns:
        The text between the quotes of the first match, without the quotes.

    Raises:
        TypeError: If no ``name: '...'`` pair is found (``re.search`` returns
            ``None``, which cannot be subscripted).
    """
    # Group 1 captures the opening quote; the trailing \1 backreference forces
    # the lazy (.*?) to extend up to the matching closing quote. Without it
    # the lazy group matches the empty string and the function returns ''.
    return re.search(rf'\b{name}\s*:\s*(\'|")(.*?)\1', script)[2]
extract('video_url', script)
'https://www.example.com/get_file/4/b315c8e0d6fad43d89445378b5292eed6981a999ba/174000/174469/174469.mp4/?br=707'
extract('video_alt_url', script)
'https://www.example.com/get_file/4/ffafbe6913656c2250c34bf20fd945a5f86898d749/174000/174469/174469_720p.mp4/?br=1290'
extract('video_alt_url2', script)
'https://www.example.com/get_file/4/66c8876a9fd8cd3d823d06880c1797b1424f3200df/174000/174469/174469_1080p.mp4/?br=2559'
正则表达式的工作原理如下:
\b{name}\s*:\s*
匹配允许可变间距的 name:
部分
('|")(.*?)\1
匹配 'some_text'
或 "some_text"
样式字符串;末尾的 \1 要求结束引号与起始引号相同
- 最后的
[2]
取第二组,即匹配引号内文字的(.*?)