Web Scraping Notes (1): Using re and urllib

Complete code

from urllib.request import urlopen
import re

# open the page, read the raw bytes and decode them into a UTF-8 string
html = urlopen(
    "https://morvanzhou.github.io/static/scraping/basic-structure.html"
).read().decode('utf-8')
print(html)

# page title
res = re.findall(r"<title>(.+?)</title>", html)
print("\nPage title is: ", res[0])

# paragraph text; re.DOTALL lets '.' match newlines as well
res = re.findall(r"<p>(.*?)</p>", html, flags=re.DOTALL)
print("\nPage paragraph is: ", res[0])

# every href attribute on the page
res = re.findall(r'href="(.*?)"', html)
print("\nAll links: ", res)

Main text

Using re and urllib

from urllib.request import urlopen

# if the page contains Chinese, apply decode()
html = urlopen(
    "https://morvanzhou.github.io/static/scraping/basic-structure.html"
).read().decode('utf-8')
print(html)  # open the page, read it, decode so Chinese displays correctly, then print

Result of print(html): the raw HTML source of the page.
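.read() returns bytes; .decode('utf-8') converts them into a str so that Chinese characters display correctly. A minimal sketch with a toy byte string (illustrative only, not the contents of the page):

raw = "莫烦Python".encode('utf-8')   # bytes, the same type that .read() returns
print(raw)                           # b'\xe8\x8e\xab\xe7\x83\xa6Python'
print(raw.decode('utf-8'))           # 莫烦Python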

Next, use re to extract the data.

import re

res = re.findall(r"<title>(.+?)</title>", html)
print(res)     # ['Scraping tutorial 1 | 莫烦Python']  -- a list
print(res[0])  # Scraping tutorial 1 | 莫烦Python

res = re.findall(r"<p>(.*?)</p>", html)
print(res)     # []  -- by default '.' does not match newlines

res = re.findall(r"<p>(.*?)</p>", html, flags=re.DOTALL)
# re.DOTALL is needed when the matched text spans multiple lines
print(res)     # a one-element list containing the paragraph text
print(res[0])  # the paragraph text itself

The outputs of the five print calls above are noted in the code comments.
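Why re.DOTALL matters: by default the '.' in a regular expression does not match a newline, so a <p> element whose text spans several lines is not matched at all. A minimal sketch with a made-up snippet (hypothetical, not the real page):

import re

snippet = "<p>line one\nline two</p>"   # hypothetical multi-line paragraph

print(re.findall(r"<p>(.*?)</p>", snippet))                   # []
print(re.findall(r"<p>(.*?)</p>", snippet, flags=re.DOTALL))  # ['line one\nline two']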

res = re.findall(r'href="(.*?)"', html)  # every href attribute on the page
print("\nAll links: ", res)

Filtering out the links.
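Note that this pattern grabs every href attribute, not just those inside <a> tags, so stylesheet links are returned as well. A quick sketch on a made-up snippet (hypothetical, not the real page):

import re

snippet = ('<link rel="stylesheet" href="style.css">'
           '<a href="https://morvanzhou.github.io/">home</a>')

print(re.findall(r'href="(.*?)"', snippet))
# ['style.css', 'https://morvanzhou.github.io/']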