问题描述
使用 Python 3 和 BeautifulSoup 从网页抓取指定内容时,我无法获取每一行中所有 “td” 单元格的信息。
这是我的代码
import requests
from bs4 import BeautifulSoup
def getHTMLText(url, timeout=10):
    """Download *url* and return the decoded response body.

    The text is decoded with the encoding detected from the content
    (``apparent_encoding``) rather than the one announced in the headers.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text, or an empty string when the request fails or the
        server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: only network/HTTP failures are turned into '',
        # so programming errors and Ctrl-C are no longer hidden.
        return ''
def main():
    """Print the list of <td> tags for every row of the first info table.

    Fetches the hard-coded article page, locates the table whose id is
    ``jqe-table-0`` and prints, row by row, the list of ``<td>`` elements.
    Nothing is printed when the page could not be fetched or the table is
    missing.
    """
    url = "http://baike.hrhrs.com/index.php?doc-view-3967.html"
    html = getHTMLText(url)
    soup = BeautifulSoup(html, 'html.parser')
    # the first one
    table1 = soup.find('table', attrs={'id': 'jqe-table-0'})
    if table1 is None:
        # getHTMLText returns '' on failure, in which case there is no table.
        return
    for trr in table1.find_all('tr'):
        td = trr.find_all('td')
        print(td)
这是输出
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A1:</strong>A2</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A3:</strong>A4</td>]
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A5:</strong>A6</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A7:</strong>A8</td>]
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A9:</strong>A10</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A11:</strong>A12</td>]
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A13:</strong>A14</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A15:</strong>A16</td>]
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A17:</strong>A18</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A19:</strong>A20</td>]
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A21:</strong>A22</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A23:</strong>A24</td>]
我使用代码:
# Only index 0 is printed, so the remaining <td> cells of each row never appear.
print(td[0].text)
结果是:
A1:A2
A5:A6
A9:A10
A13:A14
A17:A18
A21:A22
我想获取每一行中所有 “td” 的内容,例如 “A3:A4” 以及之后的单元格。我该如何修改代码才能获取全部内容?期待您的答复!
1楼
使用我原来的代码,只能得到如下结果:
中文名:柳公权
别名:诚悬
出生地:京兆华原(今陕西铜川市耀州区)
民族:汉族
出生年月:公元778年
职业:书法家
更改代码:
import requests
from bs4 import BeautifulSoup
def getHTMLText(url, timeout=10):
    """Download *url* and return the decoded response body.

    The text is decoded with the encoding detected from the content
    (``apparent_encoding``) rather than the one announced in the headers.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text, or an empty string when the request fails or the
        server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: only network/HTTP failures are turned into '',
        # so programming errors and Ctrl-C are no longer hidden.
        return ''
def main(start=100, stop=1000):
    """Crawl a range of article pages and print the text of every <td> cell.

    For each document id in ``range(start, stop)`` the page is fetched with
    ``getHTMLText`` and the text of every ``<td>`` found inside a ``<tr>`` is
    printed, one cell per line.

    The earlier title lookup was dropped: its result was only assigned into
    an undefined list, which raised ``NameError`` and aborted the whole crawl
    behind a bare ``except``.

    Args:
        start: First document id to fetch (inclusive).
        stop: Document id at which to stop (exclusive).
    """
    failed = False
    for count in range(start, stop):
        url = "http://baike.hrhrs.com/index.php?doc-view-" + str(count) + ".html"
        html = getHTMLText(url)
        if not html:
            # getHTMLText returns '' when the page could not be fetched.
            continue
        try:
            soup = BeautifulSoup(html, 'html.parser')
            for trr in soup.find_all('tr'):
                for cell in trr.find_all('td'):
                    print(str(cell.text))
        except Exception as exc:
            # Boundary handler: report the page that broke and keep crawling
            # instead of silently abandoning every remaining page.
            print("error", url, exc)
            failed = True
    if not failed:
        print("successfully!")


if __name__ == '__main__':
    main()
结果:
中文名:柳公权
中文名:柳公权
别名:诚悬
籍贯:唐朝京兆华原(今陕西耀县)
出生地:京兆华原(今陕西铜川市耀州区)
性别:男
民族:汉族
国籍:中国
......
所有内容均可用。
2楼
使用您的代码,在Jupyter Notebook中逐步运行,我得到了
中文名:柳公权
别名:诚悬
出生地:京兆华原(今陕西铜川市耀州区)
民族:汉族
出生年月:公元778年
职业:书法家
你是这个意思吗?
这是我的更改:
import requests
from bs4 import BeautifulSoup
url = "http://baike.hrhrs.com/index.php?doc-view-3967.html"


def main():
    """Print the article title and the first cell of each info-table row.

    Fetches the module-level ``url``, prints the text of each ``doctitle``
    span found under an ``h1.title_thema`` heading, then prints the text of
    the first ``<td>`` of every row in the table with id ``jqe-table-0``.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an error status.
    """
    # `url` is only read here, so no `global` declaration is needed.
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    for tag in soup.find_all('h1', class_='title_thema'):
        title = tag.find('span', id='doctitle')
        if title is not None:
            print(title.get_text())
    table1 = soup.find('table', attrs={'id': 'jqe-table-0'})
    if table1 is None:
        # Page layout differs or the table is absent: nothing to print.
        return
    for trr in table1.find_all('tr'):
        td = trr.find_all('td')
        if td:
            # Rows without any <td> (e.g. header rows) are skipped.
            print(td[0].text)


if __name__ == '__main__':
    main()
3楼
试试这个:
import requests
from bs4 import BeautifulSoup
url = "http://baike.hrhrs.com/index.php?doc-view-3967.html"
res = requests.get(url, timeout=10)
# The raw bytes are handed to BeautifulSoup so it can work out the encoding.
data = BeautifulSoup(res.content, 'html.parser')
# Walk every <td> on the page and print its text, one cell per line.
for x in data.find_all('td'):
    print(x.text)