#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Conduct a search of the current trends, query each trend,
# and save a list of each trend with its current tweets.
#
import sys, time, json, cPickle
from datetime import datetime
import twitter


#
# A very simple error handler for a small number of
# the twitter HTTP error values.
#
def handledHTTPError(err=None, waitT=3):
    """Report whether an HTTP error is one this script knows how to handle.

    err   -- error object whose HTTP status is read from err.e.code
    waitT -- seconds to sleep before returning on a 502/503

    Returns True for a 401 (after printing a notice) and for a 502/503
    (after printing a notice and sleeping waitT seconds). Returns False
    for every other status, and also when err is None or carries no
    err.e.code at all.
    """
    # The default err=None used to fail with AttributeError on err.e.code;
    # a missing error or status is now simply "not handled".
    code = getattr(getattr(err, 'e', None), 'code', None)

    # A single parenthesised argument prints the same text whether print
    # is a statement (Python 2) or a function.
    if code == 401:
        # probably ought to just skip this item
        print("Twitter 401 Error, resource is protected.")
        return True
    if code in (502, 503):
        print("Twitter %d Error, sleep for %s seconds" % (code, str(waitT)))
        time.sleep(waitT)
        return True
    return False


#
# This makes the trend query.
# The 'woeID' is for "Where On Earth". This is a Yahoo! API that
# Twitter adopted. That value of 1 means "anywhere". You can look at
# the Yahoo! documentation for Where On Earth for other possible values.
#
def queryTrends(woeID=1):
    """Fetch the trend list for a Where-On-Earth ID.

    woeID -- identifier appended to the trends call; defaults to 1

    Returns whatever the trends call returns. Returns [] when the call
    raises twitter.api.TwitterHTTPError: the error is passed to
    handledHTTPError() and, if that does not handle it, printed. The
    request is not retried in either case.
    """
    tTrends = twitter.Twitter()
    if not tTrends:
        return []
    try:
        trendQuery = tTrends.trends._(woeID)
        return trendQuery()
    # "except X as err" replaces the comma form, which is a syntax error
    # on Python 3 and easy to misread as a tuple of exception types.
    except twitter.api.TwitterHTTPError as err:
        if not handledHTTPError(err):
            # Same text as the old 'print "Twitter Error:",err', written
            # so it works as both a print statement and a print function.
            print("Twitter Error: %s" % (err,))
    return []


#
# The keys that you might extract include:
# "query", "name", "url", "events", "promoted_content"
#
def extractTrendValues(trendList=None, key="query"):
    """Pull one field out of every trend in a trends response.

    trendList -- response list; only the 'trends' entry of its first
                 element is read
    key       -- field to read from each trend item

    Returns the list of values in response order, or [] when trendList
    is empty or None.
    """
    if not trendList:
        return []
    trend_items = trendList[0]['trends']
    return [item[key] for item in trend_items]


#
# Basically, what you've seen before, a paged query of twitter
#
# FYI, in my testing I have found that some of the python code
# for the Twitter API is not completely Unicode compliant. I have
# seen this code crash at the line:
#   r = tSearch.search(q=qs,rpp=ps,page=p)
# It might not be an error in your code (or this code) if stuff
# crashes during a search.
#
def queryTweets(qs="", pages=10, ps=250, report=False):
    """Run a paged search and return the list of page responses.

    qs     -- query string; an empty one returns [] without searching
    pages  -- number of pages to request, numbered 1..pages
    ps     -- page size, sent as rpp; also sets the pause after each
              stored page to ps/1000 seconds
    report -- when True, print the item count of each page received

    Returns the truthy page responses in request order. A page whose
    TwitterHTTPError is handled by handledHTTPError() is skipped, not
    retried; any other TwitterHTTPError is re-raised.
    """
    results = []
    tSearch = twitter.Twitter(domain="search.twitter.com")
    if not (qs and tSearch):
        return results

    # The original float(ps/1000) divided two integers first, so on
    # Python 2 the delay was 0.0 for any ps below 1000 and the throttle
    # never actually waited. Dividing by a float gives the intended value.
    st = ps / 1000.0

    for p in range(1, pages + 1):
        try:
            r = tSearch.search(q=qs, rpp=ps, page=p)
        except twitter.api.TwitterHTTPError as err:
            if not handledHTTPError(err):
                # A bare raise keeps the original traceback; "raise err"
                # restarted it from this line on Python 2.
                raise
            continue
        if r:
            items = len(r['results'])
            if report:
                print("\tpage: %d, got %d items" % (p, items))
            results.append(r)
            # wait a small fraction of time after each request
            # this helps throttle our request frequency
            time.sleep(st)
    return results


#
# The returned query structure has some meta data associated with the
# actual results. This hides lots of that complexity and would not be
# appropriate for every kind of analysis. But this does give you an
# idea of how you might work to extract some pieces of information.
# Keep in mind that "results" is a nested list. 
#
# Meta data about the query would not be "in_results", so keys that
# you might extract with in_results=False include:
#   "completed_in", "query", "refresh_url", "max_id", "page"
# These keys don't always make sense to collect. For example why would
# you want a list of the "page" numbers you collected if you already
# know that you wanted to collect 10 pages?
#
# The individual tweet data *is* "in_results", so the keys that you
# might want to extract with in_results=True include:
#   "created_at", "from_user", "from_user_id", "from_user_name",
#   "id", "in_reply_to_status_id", "profile_image_url", "text",
#   "to_user", "to_user_id", "to_user_name", "source"
#
# The default behavior will get you a single list of just the items
# you have requested with the "key" parameter. However, if the
# "pair" parameter is set to True then you'll get a pairwise
# list of lists, where each nested list is the value of "pair_key"
# matched with the value of "key".
#
def extractTweetValues(pageList=None, key='text', in_results=True, pair=False, pair_key='from_user'):
    """Extract one field from a list of search page responses.

    pageList   -- list of page responses
    key        -- field to extract
    in_results -- True: read key from every item in each page's
                  'results' list; False: read key from each page itself
    pair       -- with in_results, return [pair_key value, key value]
                  lists instead of bare key values
    pair_key   -- field placed first in each pair

    Returns a flat list in page order, or [] when pageList or key is
    empty.
    """
    if not (pageList and key):
        return []

    # Page-level metadata: one value per page.
    if not in_results:
        return [page[key] for page in pageList]

    # Tweet-level data: walk every tweet of every page in order.
    collected = []
    for page in pageList:
        for tweet in page['results']:
            if pair:
                collected.append([tweet[pair_key], tweet[key]])
            else:
                collected.append(tweet[key])
    return collected


#
# This simply pickles the data that was collected. The nice thing here
# is that this will time and datestamp the file. You can run this program
# seconds apart and the files won't collide. This could be good for a
# time series analysis. Of course, you might really want this in a DB or
# structured some other way.
#
def saveTwitterData(fname="twitter_trends", tData=None):
    """Pickle tData into a timestamped file.

    fname -- file name prefix; the file written is
             "<fname>.<YYYY-MM-DD>_<HHMMSS>.pickle"
    tData -- object to pickle; nothing is written when it is empty/None

    Returns True once the data has been written. Returns False when
    tData is empty or when the file cannot be opened (a message is
    printed in that case). Errors raised while pickling propagate.
    """
    if not tData:
        return False

    # Same stamp the original built from str(datetime.now()): the space
    # becomes '_', colons are dropped and the microseconds are cut off.
    stamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    fn = "%s.%s.pickle" % (fname, stamp)

    # open() signals failure by raising, never by returning a falsy
    # object, so the original "could not open file" branch was dead code.
    try:
        f = open(fn, "wb")
    except IOError:
        print("Error: saveTwitterData(), could not open file.")
        return False

    # The with-block closes the file even if pickling raises.
    with f:
        cPickle.dump(tData, f)
    return True



def main(argv):
    """Collect tweets for every current trend and pickle the result.

    argv -- accepted for the script entry point; not used

    Fetches the trend list, searches each trend's 'query' value
    (10 pages of 100), and builds a dict keyed by the UTF-8 encoded
    trend 'name' whose values are lists of [from_user, text] pairs.
    The dict is then handed to saveTwitterData() with the prefix
    "TTrendData".
    """
    # Single-argument print(...) produces the same output whether print
    # is a statement (Python 2) or a function.
    print("Fetch trending data")
    trendData = queryTrends()
    #print(json.dumps(trendData, sort_keys=True, indent=2))
    print("Convert trend data to list of search terms")
    # the 'query' value is url encoded
    queryTermList = extractTrendValues(trendData, key="query")
    # the 'name' value is printing friendly
    printTermList = extractTrendValues(trendData, key="name")
    trend_dict = {}
    print("Query each term, collecting results")
    # Both lists come from the same trend data, so walk them in lockstep
    # instead of indexing with a hand-maintained counter.
    for qt, name in zip(queryTermList, printTermList):
        pt = name.encode('utf-8')
        print("Query for: \"%s\"" % (pt,))
        queryPages = queryTweets(qs=qt, pages=10, ps=100, report=True)
        #print(json.dumps(queryPages, sort_keys=True, indent=2))
        tweets = extractTweetValues(queryPages, key="text", pair=True, pair_key='from_user')
        if tweets:
            print("\tTotal tweets for \"%s\": %d" % (pt, len(tweets)))
        else:
            print("\tReceived 0 (zero) items for \"%s\"." % (pt,))
        # now save the list of tweets with a key that is the trend;
        # extractTweetValues already returns a fresh list, so no copy
        # is needed
        trend_dict[pt] = tweets

    print("Pickle/save the results")
    saveTwitterData(fname="TTrendData", tData=trend_dict)

# Run the collection only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main(sys.argv)
