|
原帖由 bubill 于 2009-6-19 22:31 发表 ![]()
# -*- coding: utf-8 -*-
#WebPageContent.py for Python 2.5.4
import urllib
def getWebPageContent(url):
    """Grab the content of a web page.

    Opens ``url``, reads the whole response and returns it exactly as
    read; no decoding or re-encoding is performed here.

    Args:
        url: Address of the page to fetch.

    Returns:
        The raw response body as a byte string.
    """
    # Imported locally so the function works on both Python 2
    # (urllib.urlopen) and Python 3 (urllib.request.urlopen).
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    f = urlopen(url)
    # try/finally guarantees the handle is closed even if read() raises.
    try:
        return f.read()
    finally:
        f.close()
url = 'http://www.itpub.com'
content = getWebPageContent(url)
# Save the fetched page to WebPageContent.txt (opened in append mode).
WebPageContent = open('G:\\WebPageContent.txt', 'a')
try:
    # try/finally is used instead of "with" so the script stays valid on
    # Python 2.5, and the file is closed even if the write fails.
    print >>WebPageContent, content
finally:
    WebPageContent.close()
# Original question: utf-8 is specified at the top of this file, yet the
# saved file ends up as ANSI -- not sure how to fix this!?
# NOTE: the "coding" line only declares the encoding of this source file.
# `content` is written above without any decode/encode step, so the output
# file keeps whatever encoding the fetched data already had.
easy_install 一个 chardet 包
用 chardet.detect(content) 看一下编码(返回一个字典,其中 'encoding' 键是检测到的编码),然后用 content.decode(chardet.detect(content)['encoding']).encode("utf8") 转码之后存到文件中 |
|