王珂 发表于 2015-3-3 18:56:56

抓取微博示例程序

本帖最后由 xiaoye 于 2015-9-30 11:20 编辑

由于在做舆情处理,需要大量的数据,所以写了这个程序。它调用的是微博公共 API,如果有问题请查询微博开放平台文档。
代码如下:
# -*- coding: utf-8 -*-

import weibo
import sys, os, urllib, urllib2
import json
import cookielib
import ConfigParser
import time
import MySQLdb
import re

# Re-import sys so that setdefaultencoding is callable again, then make utf8
# the process-wide default for implicit str/unicode conversions (Python 2).
reload(sys)
sys.setdefaultencoding('utf8')

# Database settings placeholders; all start out as empty strings.
db_host = db_data = db_user = db_pass = db_port = ""

# Weibo open-platform application credentials.
APP_KEY = "162454*****"  # your APP_KEY
APP_SECRET = "753847a8d16ead8a8e7618*****************"  # your app_secret
REDIRECT_URL = "https://api.weibo.com/oauth2/default.html"

# Account used by make_access_token() for the scripted login.
USER_ID = "crawlertest@sina.cn"
USER_PASS = "12345*******"

# Shared API client used by every function in this script.
client = weibo.APIClient(APP_KEY, APP_SECRET, REDIRECT_URL)
def make_access_token():
    """Log in with USER_ID/USER_PASS, obtain an OAuth2 token and persist it.

    Posts the login form to the Weibo authorize endpoint, reads the ``code``
    query parameter from the URL the server redirects to, exchanges it for an
    access token through ``client`` and stores the token with
    ``save_access_token``.

    Raises:
        ValueError: if the redirect URL carries no ``code`` parameter
            (for example because the login was rejected).
    """
    # Python 2 stdlib module; imported locally because the file does not
    # import it at the top.
    import urlparse

    params = urllib.urlencode({
        'action': 'submit',
        'withOfficalFlag': '0',
        'ticket': '',
        'isLoginSina': '',
        'response_type': 'code',
        'regCallback': '',
        'redirect_uri': REDIRECT_URL,
        'client_id': APP_KEY,
        'state': '',
        'from': '',
        'userId': USER_ID,
        'passwd': USER_PASS,
    })
    login_url = 'https://api.weibo.com/oauth2/authorize'
    url = client.get_authorize_url()

    # The original requested the authorize page before posting the form; that
    # request is kept, but the response is closed instead of being leaked.
    urllib2.urlopen(url).close()

    # Always defined (the original only bound it inside an ``if``).
    headers = {'Referer': url}
    request = urllib2.Request(login_url, params, headers)
    opener = get_opener(False)
    urllib2.install_opener(opener)
    try:
        response = opener.open(request)
        try:
            return_redirect_uri = response.geturl()
        finally:
            response.close()
    except urllib2.HTTPError as e:
        # The redirect target itself may answer with an HTTP error; the URL
        # that was reached still carries the authorization code.
        return_redirect_uri = e.geturl()

    # Extract only the value of ``code``; a bare split('=') yields a list and
    # breaks as soon as the URL has more than one parameter.
    query = urlparse.parse_qs(urlparse.urlparse(return_redirect_uri).query)
    codes = query.get('code')
    if not codes:
        raise ValueError(
            'no authorization code in redirect URL: %r' % (return_redirect_uri,))

    token = client.request_access_token(codes[0], REDIRECT_URL)
    save_access_token(token)
   
def get_opener(proxy=False):
    """Return a urllib2 opener with cookie support and redirect tracking.

    The opener is built from ``get_cookie()`` and a ``SmartRedirectHandler``
    and sends an MSIE 6 ``User-agent`` header.

    Args:
        proxy: accepted for callers' convenience; not used by this function.
    """
    handlers = (get_cookie(), SmartRedirectHandler())
    opener = urllib2.build_opener(*handlers)
    user_agent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)'
    opener.addheaders = [('User-agent', user_agent)]
    return opener

def get_cookie():
    """Return an HTTPCookieProcessor backed by a new, empty CookieJar."""
    return urllib2.HTTPCookieProcessor(cookielib.CookieJar())

class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    """Redirect handler that records the redirect status on the response.

    urllib2 invokes ``http_error_301`` / ``http_error_302`` as bound methods
    with ``(req, fp, code, msg, headers)``. The previous signatures took an
    extra ``cls`` parameter, so every redirect failed with a TypeError.
    """

    def http_error_301(self, req, fp, code, msg, headers):
        """Follow a 301 redirect and tag the resulting response with ``code``.

        Returns whatever the base handler returns; that is None when the
        base handler decides not to follow the redirect.
        """
        result = urllib2.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        if result is not None:
            result.status = code
        print(headers)
        return result

    def http_error_302(self, req, fp, code, msg, headers):
        """Follow a 302 redirect and tag the resulting response with ``code``.

        Returns whatever the base handler returns; that is None when the
        base handler decides not to follow the redirect.
        """
        result = urllib2.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        if result is not None:
            result.status = code
        print(headers)
        return result
   
def save_access_token(token):
    """Persist an access token to the ``[token]`` section of ./token.ini.

    Existing sections and options in the file are kept. The options written
    are ``access_token`` and ``expires_in`` (the names apply_access_token()
    reads back) plus ``data``, the local time of the save.

    Args:
        token: object exposing ``access_token`` and ``expires_in`` attributes.
    """
    # Do not call this local ``time``: that shadows the module and makes the
    # strftime call itself fail with UnboundLocalError.
    saved_at = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    cf = ConfigParser.ConfigParser()
    cf.read("./token.ini")
    # First run: the file or the section may not exist yet.
    if not cf.has_section("token"):
        cf.add_section("token")
    cf.set("token", "access_token", str(token.access_token))
    cf.set("token", "expires_in", str(token.expires_in))
    cf.set("token", "data", saved_at)

    # ``with`` guarantees the file is flushed and closed.
    with open("./token.ini", "w") as fh:
        cf.write(fh)
   
def apply_access_token():
    """Load the saved token from ./token.ini and install it on ``client``.

    Reads the ``access_token`` and ``expires_in`` options of the ``[token]``
    section and passes both, as read, to ``client.set_access_token``.
    """
    parser = ConfigParser.ConfigParser()
    parser.read("./token.ini")
    saved = [parser.get("token", key) for key in ("access_token", "expires_in")]
    client.set_access_token(*saved)
   
def _status_to_row(status):
    """Flatten one status dict into the 14-value row given to db_save_info.

    NOTE(review): the column order below follows the order in which the
    original code read the fields; the weibo_info table definition is not
    visible here, so confirm it matches the table's column order.
    """
    raw_source = status["source"]
    # ``source`` normally arrives as an HTML anchor; keep only its link text.
    matched = re.match(r'''<a href="(.+?)" rel="nofollow">(.+?)</a>''', raw_source)
    source = matched.group(2) if matched else raw_source
    user = status['user']
    return [
        status['created_at'],
        status["idstr"],
        status['text'],
        source,
        status['reposts_count'],
        status['comments_count'],
        status['attitudes_count'],
        user['id'],
        user['screen_name'],
        user['location'],
        user['description'],
        user['gender'],
        user['verified'],
        user['bi_followers_count'],
    ]


def get_public_weibo():
    """Fetch one page of the public timeline and store every status.

    On API error 10023 a message is printed and the process exits. On any
    other error the function sleeps for an hour and returns, so the caller's
    loop can try again. A status that cannot be converted or inserted is
    reported and skipped; the remaining statuses are still processed.
    """
    try:
        text = client.statuses.public_timeline.get()
    except StandardError as e:
        # Not every error carries ``error_code`` (e.g. network failures).
        if getattr(e, 'error_code', None) == 10023:
            print(u"当前账号已经超出使用限制,请稍候再试!")
            sys.exit(0)
        time.sleep(3600)
        # Nothing was fetched; falling through would use an unbound ``text``.
        return

    # Round-trip through JSON to work on plain dict/list objects.
    statuses = json.loads(json.dumps(text))['statuses']

    # enumerate() visits every status, including the last one.
    for i, status in enumerate(statuses):
        try:
            db_save_info(_status_to_row(status))
        except Exception as e:
            # Best effort: one bad status must not abort the batch, but the
            # failure is reported instead of being silently swallowed.
            print(u"插入第%s条数据失败: %r" % (i, e))
            continue
        print(u"插入第%s条数据成功!" % i)
   

      
      
      
      
      
def db_get_config():
    """Load database settings from the ``[dbinfo]`` section of ./token.ini.

    The values are stored in the module-level ``db_host``, ``db_data``,
    ``db_user``, ``db_pass`` and ``db_port`` variables, all as strings.
    Without the ``global`` declaration the assignments only created locals
    that were discarded on return.

    NOTE(review): in the code visible here db_save_info() and get_db_count()
    still connect with hard-coded parameters and do not read these variables.
    """
    global db_host, db_data, db_user, db_pass, db_port

    cf = ConfigParser.ConfigParser()
    cf.read("./token.ini")
    db_host = cf.get("dbinfo", "dbserver")
    db_data = cf.get("dbinfo", "db_data")
    db_user = cf.get("dbinfo", "db_user")
    db_pass = cf.get("dbinfo", "password")
    db_port = cf.get("dbinfo", "port")
   

def db_save_info(value):
    """Insert one row into ``weibo_info`` in the local ``db_info`` database.

    Args:
        value: sequence of 14 values, one per ``%s`` placeholder of the
            INSERT statement; the driver binds them as query parameters.

    The connection and cursor are always closed, also when the insert fails.
    This function is called once per status, so leaving the connection open
    leaked one connection per inserted row.
    """
    conn = MySQLdb.connect("127.0.0.1", "root", "123456", charset="utf8")
    try:
        conn.select_db("db_info")
        cursor = conn.cursor()
        try:
            cursor.execute("SET NAMES 'utf8'")
            cursor.execute('''insert into weibo_info values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''', value)
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()

   
def get_db_count():
    """Return the number of rows in ``weibo_info`` as an int.

    The previous version returned the whole result row (a one-element
    tuple), which made the caller's ``count1-count`` subtraction raise
    TypeError. The connection and cursor are now closed before returning.
    """
    conn = MySQLdb.connect("127.0.0.1", "root", "123456")
    try:
        conn.select_db("db_info")
        cursor = conn.cursor()
        try:
            cursor.execute("select count(*) from weibo_info")
            # count(*) always yields exactly one row with one column.
            row = cursor.fetchone()
        finally:
            cursor.close()
    finally:
        conn.close()
    return int(row[0])


if __name__ == "__main__":
    # Start-up banner.
    print(u'''
###################################
# @Author iceforce
# @date 2014-11-17
##################################
#抓取公共微博信息示例
#
##################################
            ''')
    # Row count before this run; used to report how many rows were added.
    baseline = get_db_count()
    apply_access_token()
    for round_no in range(150):
        inserted = get_db_count() - baseline
        print(u"=======================================第%s次执行,已插入%s条数据=============================================" % (round_no, inserted))
        get_public_weibo()

    print(u"============================================执行完成!============================================================")

   

Jack-5 发表于 2015-6-28 04:02:49

学习学习技术,加油!

wtsqq123 发表于 2015-6-30 01:34:39

学习学习技术,加油!

arctic 发表于 2015-6-30 09:33:06

还是不错的哦,顶了

H.U.C-麦麦 发表于 2015-6-30 09:36:19

感谢楼主的分享~

r00tc4 发表于 2015-7-1 17:29:18

支持中国红客联盟(ihonker.org)
页: [1]
查看完整版本: 抓取微博示例程序