抓取微博示例程序
本帖最后由 xiaoye 于 2015-9-30 11:20 编辑。
由于在做舆情处理,需要大量的数据,所以就写了这个程序。它调用的是微博公共API,如果有问题,请查询微博文档。
代码如下:
# -*- coding: utf-8 -*-
import weibo
import sys, os, urllib, urllib2
import json
import cookielib
import ConfigParser
import time
import MySQLdb
import re
# Python 2 idiom: reloading sys makes setdefaultencoding callable again so
# that implicit str/unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

# Database settings; empty placeholders at import time.
db_host = ""
db_data = ""
db_user = ""
db_pass = ""
db_port = ""

# Weibo application credentials and the OAuth2 callback address.
APP_KEY = "162454*****"  # your APP_KEY
APP_SECRET = "753847a8d16ead8a8e7618*****************"  # your app_secret
REDIRECT_URL = "https://api.weibo.com/oauth2/default.html"

# Account whose login is posted to the authorize endpoint.
USER_ID = "crawlertest@sina.cn"
USER_PASS = "12345*******"

# API client shared by the token and timeline functions below.
client = weibo.APIClient(APP_KEY, APP_SECRET, REDIRECT_URL)
def make_access_token():
    """Log in through the OAuth2 authorize endpoint and store a new token.

    Posts USER_ID / USER_PASS to the authorize URL, reads the authorization
    code from the URL the server redirects to, exchanges that code for an
    access token with ``client.request_access_token`` and passes the result
    to ``save_access_token``.

    Raises:
        ValueError: if the redirect URL carries no ``code`` query parameter.
    """
    params = urllib.urlencode({
        'action': 'submit',
        'withOfficalFlag': '0',
        'ticket': '',
        'isLoginSina': '',
        'response_type': 'code',
        'regCallback': '',
        'redirect_uri': REDIRECT_URL,
        'client_id': APP_KEY,
        'state': '',
        'from': '',
        'userId': USER_ID,
        'passwd': USER_PASS,
    })
    login_url = 'https://api.weibo.com/oauth2/authorize'
    url = client.get_authorize_url()

    # The original fetched the authorize page before posting the login form
    # and only tested the response for truthiness; keep the request but
    # release the connection instead of leaking it.
    urllib2.urlopen(url).close()

    headers = {'Referer': url}
    request = urllib2.Request(login_url, params, headers)
    opener = get_opener(False)
    urllib2.install_opener(opener)
    try:
        response = opener.open(request)
    except urllib2.HTTPError as e:
        # The final URL is still available on the error object.
        return_redirect_uri = e.geturl()
    else:
        return_redirect_uri = response.url
        response.close()

    # Extract only the value of the "code" parameter.  Splitting on "="
    # yields a list, and a plain substring search for "code=" would also
    # match parameters such as "error_code=".
    found = re.search(r'[?&]code=([^&#]+)', return_redirect_uri)
    if found is None:
        raise ValueError(
            'no authorization code in redirect URL: %s' % return_redirect_uri)
    code = found.group(1)

    token = client.request_access_token(code, REDIRECT_URL)
    save_access_token(token)
def get_opener(proxy=False):
    """Build a urllib2 opener with cookie handling and redirect tracking.

    The ``proxy`` argument is accepted but not used by this function.
    """
    user_agent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)'
    handlers = (get_cookie(), SmartRedirectHandler())
    opener = urllib2.build_opener(*handlers)
    opener.addheaders = [('User-agent', user_agent)]
    return opener
def get_cookie():
cookies = cookielib.CookieJar()
return urllib2.HTTPCookieProcessor(cookies)
class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
def http_error_301(self,cls, req, fp, code, msg, headers):
result = urllib2.HTTPRedirectHandler.http_error_301(cls, req, fp, code, msg, headers)
result.status = code
print headers
return result
def http_error_302(self,cls, req, fp, code, msg, headers):
result = urllib2.HTTPRedirectHandler.http_error_302(cls, req, fp, code, msg, headers)
result.status = code
print headers
return result
def save_access_token(token):
time=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
cf=ConfigParser.ConfigParser()
cf.read("./token.ini")
cf.set("token","token",token.access_token)
cf.set("token","expires_in",token.expires_in )
cf.set("token","data",time)
cf.write(open("./token.ini","w"))
def apply_access_token():
    """Load the stored token from ./token.ini and install it on ``client``."""
    parser = ConfigParser.ConfigParser()
    parser.read("./token.ini")
    credentials = [parser.get("token", option)
                   for option in ("access_token", "expires_in")]
    client.set_access_token(*credentials)
def _status_to_row(status):
    """Flatten one status dict into the 14 values inserted per row."""
    user = status['user']
    source = status['source']
    # The source field is matched as an HTML link and only the link text is
    # kept; a value that does not look like a link is stored unchanged.
    link = re.match(r'''<a href="(.+?)" rel="nofollow">(.+?)</a>''', source)
    if link:
        source = link.group(2)
    # NOTE(review): 14 values for the 14 placeholders in db_save_info();
    # the order follows the original assignments -- confirm it matches the
    # column order of weibo_info.
    return [
        status['created_at'],
        status['idstr'],
        status['text'],
        source,
        status['reposts_count'],
        status['comments_count'],
        status['attitudes_count'],
        user['id'],
        user['screen_name'],
        user['location'],
        user['description'],
        user['gender'],
        user['verified'],
        user['bi_followers_count'],
    ]


def get_public_weibo():
    """Fetch one page of the public timeline and store every status.

    Exits the process with status 0 when the API reports error code 10023;
    any other API error is re-raised.  A status that cannot be converted or
    saved is reported and skipped so the rest of the page is still stored.
    """
    try:
        text = client.statuses.public_timeline.get()
    except StandardError as e:
        # Not every exception carries error_code, so read it defensively.
        if getattr(e, 'error_code', None) == 10023:
            print(u"当前账号已经超出使用限制,请稍候再试!")
            sys.exit(0)
        # Previously execution fell through to a NameError on ``text``.
        raise

    # Round-trip through JSON as the original did (presumably to turn the
    # SDK response object into plain dicts and lists).
    js = json.loads(json.dumps(text))
    content = js['statuses']

    # Iterate over every status; the old range(0, len - 1) skipped the last.
    for i, status in enumerate(content):
        try:
            db_save_info(_status_to_row(status))
        except Exception as err:
            # Best effort per record, but say why a record was dropped.
            print(u"skipping status %s: %r" % (i, err))
            continue
        print(u"插入第%s条数据成功!" % i)
def db_get_config():
cf=ConfigParser.ConfigParser()
cf.read("./token.ini")
db_host=cf.get("dbinfo", "dbserver")
db_data=cf.get("dbinfo","db_data")
db_user=cf.get("dbinfo","db_user")
db_pass=cf.get("dbinfo","password")
db_port=cf.get("dbinfo","port")
def db_save_info(value):
    """Insert one row into ``weibo_info`` in the ``db_info`` database.

    Args:
        value: sequence of 14 values, one per placeholder in the INSERT
            statement.

    The connection and cursor are closed on every path; previously each
    call left its connection open.
    """
    # NOTE(review): host and credentials are hard-coded here; the
    # module-level db_* settings are not consulted.
    conn = MySQLdb.connect("127.0.0.1", "root", "123456", charset="utf8")
    try:
        conn.select_db("db_info")
        cursor = conn.cursor()
        try:
            cursor.execute("SET NAMES 'utf8'")
            cursor.execute('''insert into weibo_info values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''', value)
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
def get_db_count():
    """Return the number of rows in ``weibo_info`` as an int.

    The previous version returned the result row itself (a one-element
    tuple), which the caller cannot subtract.  The connection and cursor
    are now closed on every path.
    """
    # NOTE(review): host and credentials are hard-coded here; the
    # module-level db_* settings are not consulted.
    conn = MySQLdb.connect("127.0.0.1", "root", "123456")
    try:
        conn.select_db("db_info")
        cursor = conn.cursor()
        try:
            cursor.execute("select count(*) from weibo_info")
            row = cursor.fetchone()
        finally:
            cursor.close()
    finally:
        conn.close()
    return int(row[0])
if __name__ == "__main__":
    # Banner, then 150 fetch rounds; each round first reports how many rows
    # have been added since the script started.
    banner = u'''
###################################
# @Author iceforce
# @date 2014-11-17
##################################
#抓取公共微博信息示例
#
##################################
'''
    print(banner)
    baseline = get_db_count()
    apply_access_token()
    for round_no in range(150):
        inserted = get_db_count() - baseline
        print(u"=======================================第%s次执行,已插入%s条数据=============================================" % (round_no, inserted))
        get_public_weibo()
    print(u"============================================执行完成!============================================================")
学习学习技术,加油! 学习学习技术,加油! 还是不错的哦,顶了 感谢楼主的分享~ 支持中国红客联盟(ihonker.org)
页:
[1]