如果在列表中找不到键,如何获得默认值零?
How do I get a default value zero if key isn't found in the list?
# Count the words of every page and append one [word, count] pair per distinct
# word to the running list `l`, which is created before this snippet.
for url in urls:
    uClient = ureq(url)
    page_html = uClient.read()
    uClient.close()
    soup = BeautifulSoup(page_html, "html.parser")
    # One string per <p> element, made of all of its text nodes.
    text = (''.join(s.findAll(text=True)) for s in soup.findAll('p'))
    c = Counter(
        re.sub(r"[^a-zA-Z0-9 ]", "", x).strip(punctuation).lower()
        for y in text
        for x in y.split()
    )
    l.extend([key, c[key]] for key in sorted(c))

# Group the counts by word.  A word only contributes an entry for the pages in
# which it occurs, so the per-word lists can end up with different lengths.
d = collections.defaultdict(list)
for k, v in l:
    d[k].append(v)
print(d.items())
我得到的输出是:
([('', [3, 9, 4, 1]), ('1', [1, 2, 2]), ('1960', [1]), ('1974', [1]), ('1996', [1]), ('1997', [1]), ('1998', [1]), ('2001', [2]), ('2002', [1]), ...
如果某个键在某个列表中不存在,我希望它的默认值为 0。例如,键 g 在第一个列表中出现 1 次,在第二个列表中出现 0 次,在第三个列表中出现 3 次,在第四个列表中出现 6 次,那么结果应该是:'g': [1, 0, 3, 6]
编辑:
下面是我完整代码中的几行(部分已被注释掉),用来展示我做过但没有成功的尝试:
# Earlier attempts that did not work, kept commented out for reference:
#m = list(map(dict, map(zip, list_1, list_2)))
#matrix = pd.DataFrame.from_dict(d, orient='index')
# One Series per term: pandas pads columns of unequal length with NaN.
matrix = pd.DataFrame(dict(zip(d.keys(), map(pd.Series, d.values()))))
我有一个名为“urls.txt”的文本文件,其中包含 URL:
https://en.wikipedia.org/wiki/Data_science
https://datajobs.com/what-is-data-science
我需要一个包含所有唯一字母数字词项的文档-词项矩阵。以单词 data 和 science 为例:
每一行应该是 [文档编号, 词项 'data' 的出现次数, 词项 'science' 的出现次数]
它应该显示为:
data science
1 96 65
2 105 22
3 0 16
我已经非常接近了,但始终无法得到正确的结果。我尝试过把列表转换为数据框、把字典转换为数据框,以及完全使用数据框来处理,但都没有成功。我到处搜索,也没有找到类似的问题。
我来回答自己的问题:我已经找到了一种解决方法,把它贴在这里,以便有需要的人参考:
import requests
from bs4 import BeautifulSoup
import collections
from string import punctuation
from urllib.request import urlopen as ureq
import re
import pandas as pd
import numpy as np
import operator
# Shared output handle: web_parsing() writes both of its CSV tables to it.
Q1 = open("Q1.txt", "w")


def _normalize_token(raw):
    """Lower-case *raw* and drop everything but ASCII letters, digits and spaces."""
    return re.sub(r"[^a-zA-Z0-9 ]", "", raw).lower()


def _load_stopwords(path):
    """Return the stop words stored in *path* as a set of normalized tokens.

    The file may hold comma-separated words, one word per line, or a mix of
    both.  Words are normalized exactly like page tokens so that membership
    tests against the vocabulary actually match.
    """
    with open(path, "r") as stopfile:
        candidates = (
            _normalize_token(word).strip()
            for line in stopfile
            for word in line.split(",")
        )
        return {word for word in candidates if word}


def _count_terms(url):
    """Download *url* and count the normalized words inside its ``<p>`` elements."""
    # The context manager closes the connection even if read() raises.
    with ureq(url) as response:
        page_html = response.read()
    soup = BeautifulSoup(page_html, "html.parser")
    paragraphs = ("".join(p.findAll(text=True)) for p in soup.findAll("p"))
    # `collections.Counter`: the file imports `collections`, not a bare `Counter`.
    return collections.Counter(
        _normalize_token(word)
        for paragraph in paragraphs
        for word in paragraph.split()
    )


def web_parsing(filename):
    """Build a document-term matrix for the URLs listed in *filename*.

    Each non-blank line of *filename* is treated as one URL.  Every page is
    downloaded, the text of its ``<p>`` elements is tokenized, and the
    occurrences of each token are counted.  Empty tokens and the words listed
    in ``stop_words.txt`` are excluded from the vocabulary.

    Two tables are written as CSV to the module-level ``Q1`` handle: the full
    matrix, followed by a view restricted to a fixed list of terms (a term
    that never occurs is reported as 0 in that view).  The vocabulary size is
    printed.

    Args:
        filename: Path of a text file containing one URL per line.

    Returns:
        A DataFrame with one row per URL (in file order, 0-based index) and
        one column per vocabulary term (sorted); a term that does not occur
        in a document has the value 0.

    Raises:
        OSError: If *filename* or ``stop_words.txt`` cannot be opened.
        urllib.error.URLError: If a page cannot be downloaded.
    """
    with open(filename, "r") as url_file:
        # Drop the trailing newline of every line and skip blank lines.
        urls = [line.strip() for line in url_file if line.strip()]

    # The stop-word file is only needed when there is something to filter.
    stopwords = _load_stopwords("stop_words.txt") if urls else set()

    counters = [_count_terms(url) for url in urls]

    vocabulary = sorted(
        {term for counter in counters for term in counter if term} - stopwords
    )

    # Indexing a Counter with a missing key yields 0, which is the default
    # wanted for terms that do not occur in a document.
    df1 = pd.DataFrame(
        collections.OrderedDict(
            (term, [counter[term] for counter in counters])
            for term in vocabulary
        )
    )

    # reindex() instead of .loc[] so that a term missing from every page
    # becomes a column of zeros instead of raising KeyError.
    df2 = df1.reindex(
        columns=['data', 'companies', 'business', 'action', 'mining', 'science'],
        fill_value=0,
    )

    df1.to_csv(Q1, header=True)
    df2.to_csv(Q1, header=True)
    # Q1 stays open for later calls; flush so the data reaches the disk.
    Q1.flush()

    print(len(vocabulary))
    return df1
# Count the words of every page and append one [word, count] pair per distinct
# word to the running list `l`, which is created before this snippet.
for url in urls:
    uClient = ureq(url)
    page_html = uClient.read()
    uClient.close()
    soup = BeautifulSoup(page_html, "html.parser")
    # One string per <p> element, made of all of its text nodes.
    text = (''.join(s.findAll(text=True)) for s in soup.findAll('p'))
    c = Counter(
        re.sub(r"[^a-zA-Z0-9 ]", "", x).strip(punctuation).lower()
        for y in text
        for x in y.split()
    )
    l.extend([key, c[key]] for key in sorted(c))

# Group the counts by word.  A word only contributes an entry for the pages in
# which it occurs, so the per-word lists can end up with different lengths.
d = collections.defaultdict(list)
for k, v in l:
    d[k].append(v)
print(d.items())
我得到的输出是:
([('', [3, 9, 4, 1]), ('1', [1, 2, 2]), ('1960', [1]), ('1974', [1]), ('1996', [1]), ('1997', [1]), ('1998', [1]), ('2001', [2]), ('2002', [1]), ...
如果某个键在某个列表中不存在,我希望它的默认值为 0。例如,键 g 在第一个列表中出现 1 次,在第二个列表中出现 0 次,在第三个列表中出现 3 次,在第四个列表中出现 6 次,那么结果应该是:'g': [1, 0, 3, 6]
编辑:
下面是我完整代码中的几行(部分已被注释掉),用来展示我做过但没有成功的尝试:
# Earlier attempts that did not work, kept commented out for reference:
#m = list(map(dict, map(zip, list_1, list_2)))
#matrix = pd.DataFrame.from_dict(d, orient='index')
# One Series per term: pandas pads columns of unequal length with NaN.
matrix = pd.DataFrame(dict(zip(d.keys(), map(pd.Series, d.values()))))
我有一个名为“urls.txt”的文本文件,其中包含 URL:
https://en.wikipedia.org/wiki/Data_science
https://datajobs.com/what-is-data-science
我需要一个包含所有唯一字母数字词项的文档-词项矩阵。以单词 data 和 science 为例:
每一行应该是 [文档编号, 词项 'data' 的出现次数, 词项 'science' 的出现次数]
它应该显示为:
data science
1 96 65
2 105 22
3 0 16
我已经非常接近了,但始终无法得到正确的结果。我尝试过把列表转换为数据框、把字典转换为数据框,以及完全使用数据框来处理,但都没有成功。我到处搜索,也没有找到类似的问题。
我来回答自己的问题:我已经找到了一种解决方法,把它贴在这里,以便有需要的人参考:
import requests
from bs4 import BeautifulSoup
import collections
from string import punctuation
from urllib.request import urlopen as ureq
import re
import pandas as pd
import numpy as np
import operator
# Shared output handle: web_parsing() writes both of its CSV tables to it.
Q1 = open("Q1.txt", "w")


def _normalize_token(raw):
    """Lower-case *raw* and drop everything but ASCII letters, digits and spaces."""
    return re.sub(r"[^a-zA-Z0-9 ]", "", raw).lower()


def _load_stopwords(path):
    """Return the stop words stored in *path* as a set of normalized tokens.

    The file may hold comma-separated words, one word per line, or a mix of
    both.  Words are normalized exactly like page tokens so that membership
    tests against the vocabulary actually match.
    """
    with open(path, "r") as stopfile:
        candidates = (
            _normalize_token(word).strip()
            for line in stopfile
            for word in line.split(",")
        )
        return {word for word in candidates if word}


def _count_terms(url):
    """Download *url* and count the normalized words inside its ``<p>`` elements."""
    # The context manager closes the connection even if read() raises.
    with ureq(url) as response:
        page_html = response.read()
    soup = BeautifulSoup(page_html, "html.parser")
    paragraphs = ("".join(p.findAll(text=True)) for p in soup.findAll("p"))
    # `collections.Counter`: the file imports `collections`, not a bare `Counter`.
    return collections.Counter(
        _normalize_token(word)
        for paragraph in paragraphs
        for word in paragraph.split()
    )


def web_parsing(filename):
    """Build a document-term matrix for the URLs listed in *filename*.

    Each non-blank line of *filename* is treated as one URL.  Every page is
    downloaded, the text of its ``<p>`` elements is tokenized, and the
    occurrences of each token are counted.  Empty tokens and the words listed
    in ``stop_words.txt`` are excluded from the vocabulary.

    Two tables are written as CSV to the module-level ``Q1`` handle: the full
    matrix, followed by a view restricted to a fixed list of terms (a term
    that never occurs is reported as 0 in that view).  The vocabulary size is
    printed.

    Args:
        filename: Path of a text file containing one URL per line.

    Returns:
        A DataFrame with one row per URL (in file order, 0-based index) and
        one column per vocabulary term (sorted); a term that does not occur
        in a document has the value 0.

    Raises:
        OSError: If *filename* or ``stop_words.txt`` cannot be opened.
        urllib.error.URLError: If a page cannot be downloaded.
    """
    with open(filename, "r") as url_file:
        # Drop the trailing newline of every line and skip blank lines.
        urls = [line.strip() for line in url_file if line.strip()]

    # The stop-word file is only needed when there is something to filter.
    stopwords = _load_stopwords("stop_words.txt") if urls else set()

    counters = [_count_terms(url) for url in urls]

    vocabulary = sorted(
        {term for counter in counters for term in counter if term} - stopwords
    )

    # Indexing a Counter with a missing key yields 0, which is the default
    # wanted for terms that do not occur in a document.
    df1 = pd.DataFrame(
        collections.OrderedDict(
            (term, [counter[term] for counter in counters])
            for term in vocabulary
        )
    )

    # reindex() instead of .loc[] so that a term missing from every page
    # becomes a column of zeros instead of raising KeyError.
    df2 = df1.reindex(
        columns=['data', 'companies', 'business', 'action', 'mining', 'science'],
        fill_value=0,
    )

    df1.to_csv(Q1, header=True)
    df2.to_csv(Q1, header=True)
    # Q1 stays open for later calls; flush so the data reaches the disk.
    Q1.flush()

    print(len(vocabulary))
    return df1