Here you can find the entire code.

I wanted to download some flashcards from Twitter, so I wrote this crawler to download them.

# Import Libs
import tweepy 
import wget
import os

# Twitter API credentials: replace each "XPTO" placeholder with your own value.
# You can get those here: https://developer.twitter.com/en/docs/twitter-api/getting-started/about-twitter-api
api_key             = "XPTO"
api_secret_key      = "XPTO"
access_token        = "XPTO"
access_token_secret = "XPTO"

# The OAuth handler below takes the API key pair under the "consumer" names.
consumer_key        = api_key
consumer_secret     = api_secret_key

# Build the OAuth handler from the consumer pair, then attach the access token.
# The handler must be bound to the name `auth`, which the following lines use.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Client object used for every timeline request further down.
api = tweepy.API(auth)

# Options shared by every timeline request, kept in one place so the first
# page and the follow-up pages cannot drift apart (retweets and replies are
# excluded on all of them).
timeline_options = {
    'screen_name': 'chrisalbon',
    'count': 200,
    'include_rts': False,
    'exclude_replies': True,
}

# Get the first page of tweets from Chris
tweets = api.user_timeline(**timeline_options)

# One page is not enough. Keep requesting older pages until none come back.
# An empty first page leaves the cursor unset, so the loop is skipped entirely.
last_id = tweets[-1].id if tweets else None

while last_id is not None:
    # max_id is inclusive, so ask only for ids strictly below the oldest
    # tweet already collected.
    more_tweets = api.user_timeline(max_id=last_id - 1, **timeline_options)

    # No more tweets
    if not more_tweets:
        break

    # Move the cursor to the oldest tweet of this page; the "- 1" is applied
    # once, when building the next request.
    last_id = more_tweets[-1].id
    tweets.extend(more_tweets)


# Chris stopped using a hashtag and started linking to a URL
def has_ML_url(s):
    """Tell whether the first link of a tweet is the flashcards site.

    Args:
        s: A status object exposing an ``entities`` mapping.

    Returns:
        True when ``entities['urls']`` is non-empty and the ``display_url``
        of its first entry equals ``machinelearningflashcards.com``;
        False otherwise (including when there are no URLs at all).
    """
    links = s.entities.get('urls')
    if not links:
        return False
    # Only the first link is inspected.
    first_link = links[0]
    return first_link.get('display_url') == 'machinelearningflashcards.com'

# Keep only the tweets whose first link is machinelearningflashcards.com
card_tweets = list(filter(has_ML_url, tweets))


# Map each card title to the URL of the first photo attached to its tweet.
media_files = {}
for status in card_tweets:
    # The title is the tweet text up to the first ' http'.
    title = status.text.split(' http')[0]
    attachments = status.entities.get('media', [])
    # Skip tweets with no media, or whose first attachment is not a photo.
    if not (len(attachments) > 0 and attachments[0]['type'] == 'photo'):
        continue
    first_photo = attachments[0]
    media_files[title] = first_photo['media_url']


# create a directory to store your photos
output_dir = 'ml-cards'
os.makedirs(output_dir, exist_ok=True)

for title, url in media_files.items():
    # A path separator inside the title would make the output path point into
    # a sub-directory that does not exist, so replace separators before
    # building the file name. os.altsep is None on some platforms.
    safe_title = title
    for separator in filter(None, (os.sep, os.altsep)):
        safe_title = safe_title.replace(separator, '_')

    # Get the photos!
    wget.download(url, out="{}/{}.png".format(output_dir, safe_title))

Leave a Reply