Advanced Data Collection Including Images Using the Twitter API

Updated code to download Twitter data, including images, via the Twitter API.
Author

Mohammad Belal

Published

August 28, 2022

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# For sending GET requests from the API
import requests
# For saving access tokens and for file management when creating and adding to the dataset
import os
# For dealing with json responses we receive from the API
import json
# For displaying the data after
import pandas as pd
# For saving the response data in CSV format
import csv
# For parsing the dates received from twitter in readable formats
from datetime import datetime, timedelta

import datetime
import dateutil.parser
import unicodedata
#To add wait time between requests
import time
from pathlib import Path
#to read excel
!pip install openpyxl
Requirement already satisfied: openpyxl in c:\users\acer\anaconda3\lib\site-packages (3.0.9)
Requirement already satisfied: et-xmlfile in c:\users\acer\anaconda3\lib\site-packages (from openpyxl) (1.1.0)
# Store the Twitter API bearer token in the environment.
os.environ['TOKEN'] = '#put your token'


def auth():
    """Return the bearer token stored in the TOKEN environment variable."""
    return os.environ.get('TOKEN')


# Module-level state reused by the crawling loop below.
i = 0
df_final = pd.DataFrame()


#function for data crawling
def create_headers():
    """Build the Authorization header required by the Twitter v2 endpoints."""
    token = os.environ['TOKEN']
    return {"Authorization": f"Bearer {token}"}

def create_url(keyword, start_date, end_date, max_results=10, next_token=None):
    """Build the URL and query parameters for the Twitter API v2 full-archive search.

    Args:
        keyword: the search query string.
        start_date / end_date: ISO-8601 timestamps bounding the search window.
        max_results: tweets to request per page (the API caps this at 500).
        next_token: pagination token from a previous response, or None for page 1.

    Returns:
        A (search_url, query_params) tuple ready to pass to requests.
    """
    search_url = "https://api.twitter.com/2/tweets/search/all"  # Change to the endpoint you want to collect data from
    # change params based on the endpoint you are using
    query_params = {'query': keyword,
                    'start_time': start_date,
                    'end_time': end_date,
                    'max_results': max_results,
                    'expansions': 'author_id,in_reply_to_user_id,geo.place_id,attachments.media_keys',
                    'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
                    'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
                    'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                    'media.fields': 'url',
                    # Bug fix: this was initialised to a literal `{}` dict
                    # placeholder; use the caller's token (None = first page).
                    'next_token': next_token}
    return (search_url, query_params)

def connect_to_endpoint(url, headers, params, next_token = None):
    """Issue a GET request against the endpoint and return the decoded JSON.

    Raises Exception(status_code, body) on any non-200 response.
    """
    # Overwrite the pagination token with the one for the page being fetched.
    params['next_token'] = next_token
    response = requests.get(url, headers=headers, params=params)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def get_result_from_Twitter_API(keyword, current, start_time, end_time, max_results=10):
    """Crawl up to *max_results* tweets matching *keyword*, following pagination.

    Each tweet's attached image URLs (if any) are resolved through the
    response's `includes.media` section.

    Args:
        keyword: full query string sent to the API.
        current: label stored with every tweet (the raw keyword being crawled).
        start_time / end_time: ISO-8601 bounds of the search window.
        max_results: total number of tweets to collect across all pages.

    Returns:
        dict with a 'summary' of counts and a 'result' list of per-tweet dicts.
    """
    # The API allows at most 500 results per request page.
    page_size = min(max_results, 500)  # renamed: `max` shadowed the builtin
    next_token = None
    result = []
    count = 0
    media_count = 0
    # NOTE(review): these sentiment counters are never updated anywhere in this
    # function; they stay 0 (overall == "neutral") and are kept only so the
    # summary schema returned to callers is unchanged.
    pos_count = 0
    neg_count = 0
    neu_count = 0
    score = 0
    while count == 0 or next_token is not None:
        headers = create_headers()
        url = create_url(keyword, start_time, end_time, page_size)
        json_response = connect_to_endpoint(url[0], headers, url[1], next_token)

        media_info = []
        includes = json_response.get('includes')
        if includes and includes.get('media'):
            media_info = includes['media']

        # Robustness fix: a page with zero matches carries no 'data' key;
        # the old code raised KeyError here.
        data_info = json_response.get('data', [])
        for data in data_info:
            # Map the tweet's media keys to downloadable URLs
            # (only photo entries carry a 'url' field).
            image_links = []
            media_keys = []
            attachments = data.get('attachments')
            if attachments and attachments.get('media_keys'):
                media_keys = attachments['media_keys']
            for media_key in media_keys:
                if media_info:
                    index = next((i for i, item in enumerate(media_info)
                                  if 'media_key' in item and item['media_key'] == media_key), -1)
                    if index != -1 and 'url' in media_info[index]:
                        image_links.append(media_info[index]['url'])
            media_count = media_count + len(image_links)

            obj = {
                "created_at": data['created_at'],
                "source": data['source'],
                "author_id": data['author_id'],
                "conversation_id": data['conversation_id'],
                "id": data['id'],
                "text": data['text'],
                "keyword": current,
                "image_count": len(image_links),
                "image_link": image_links
            }
            result.append(obj)
            count = count + 1
            if count >= max_results:
                break

        meta = json_response.get('meta')
        if count < max_results and meta and 'next_token' in meta:
            next_token = meta['next_token']
        else:
            next_token = None

    if pos_count > neg_count:
        overall = "positive"
    elif neg_count > pos_count:
        overall = "negative"
    else:
        overall = "neutral"
    # Bug fix: guard against ZeroDivisionError when no tweets were returned.
    score = score / len(result) if result else 0
    jsonObj = {"summary": {"overall": overall, "count": count,
                           "media_count": media_count, "pos_count": pos_count,
                           "neg_count": neg_count, "neu_count": neu_count,
                           "score": score},
               'result': result}
    return jsonObj

def download_img(url, path, name):
  """Download the image at *url* into <path>/<name>.jpg and return that path.

  Creates *path* if it does not exist and overwrites any previous download
  of the same name.
  """
  Path(path + "/").mkdir(parents=True, exist_ok=True)
  file_path = '{}/{}.jpg'.format(path, name)
  # Bug fix: the old code checked/removed bare `name` (relative to the CWD)
  # instead of the actual target file.
  if os.path.exists(file_path):
    os.remove(file_path)
  img_data = requests.get(url).content
  with open(file_path, 'wb') as handler:
      handler.write(img_data)
  # Bug fix: return the path actually written (the old return omitted ".jpg").
  return file_path
# Keywords to crawl; each day's results for all keywords are merged,
# de-duplicated by tweet id, and written to one Excel file.
li = ['football world cup', 'soccer world cup']
#@title Insert the date
root = '/content/'
starttime = "2022-08-01" #@param {type:"date"}
num = 1 #@param {type:"integer"}
upload = False #@param {type:"boolean"}
start_time = starttime + "T00:00:00Z"
max_results = 100


for i in range(0, num):
  df_final = pd.DataFrame()
  st_date = dateutil.parser.parse(starttime)
  # Shift the one-day window forward by `i` days from the chosen start date.
  st = st_date + timedelta(days=i)
  start_time = st.strftime("%Y-%m-%dT%H:%M:%SZ")
  et = st_date + timedelta(days=(i + 1))
  end_time = et.strftime("%Y-%m-%dT%H:%M:%SZ")
  print("=============== " + start_time + " to " + end_time + " =====================")

  max_results = 100000
  final_json_result = None
  for j in li:
    keyword = "(" + j + ") " + " lang:en -is:retweet"
    current = j
    json_result = get_result_from_Twitter_API(keyword, current, start_time, end_time, max_results)
    time.sleep(5)  # stay under the API rate limit between keyword crawls
    df = pd.DataFrame(json_result['result'])
    # Bug fix: DataFrame.append was deprecated and removed in pandas 2.0;
    # pd.concat without ignore_index reproduces the old behaviour exactly.
    df_final = pd.concat([df_final, df])
    df_final.drop_duplicates(subset=['id'], inplace=True)
  print(st.date())
  df_final.to_excel('Qatar{}.xlsx'.format(st.date()))
=============== 2022-08-01T00:00:00Z to 2022-08-02T00:00:00Z =====================
Endpoint Response Code: 200
Endpoint Response Code: 200
Endpoint Response Code: 200
2022-08-01
df_final
created_at source author_id conversation_id id text keyword image_count image_link
0 2022-08-01T23:56:34.000Z dlvr.it 2534574684 1554254794260238337 1554254794260238337 Breaking down how the Asian qualifiers will wo... football world cup 0 []
1 2022-08-01T23:48:46.000Z Twitter Web App 1135812161522020352 1554252833322663942 1554252833322663942 The #VodacomNelsonMandelaCup an amalgamation o... football world cup 1 [https://pbs.twimg.com/media/FZHPGlqXwAIgTvs.png]
2 2022-08-01T23:45:25.000Z Twitter Web App 1163702851396362240 1553811467958452227 1554251987851644928 But at least there isn't too much dispute abou... football world cup 0 []
3 2022-08-01T23:42:37.000Z Twitter for iPhone 1242899809880678403 1553808295219150853 1554251284538167296 @RobynsBookshelf @kajoken3 @lfcdaniell You lit... football world cup 0 []
4 2022-08-01T23:34:23.000Z Twitter Web App 1378325044800143360 1554249213889896453 1554249213889896453 Qatar World Cup: Qatar tourism begins huge dri... football world cup 2 [https://pbs.twimg.com/media/FZHNSzyWIAM5X_Z.j...
... ... ... ... ... ... ... ... ... ...
170 2022-08-01T00:36:07.000Z Twitter Web App 1283317908 1553501018536775681 1553902360430264320 @TommyB358 @ScottLou4Life @Zamir_Nation @mcr_b... soccer world cup 0 []
171 2022-08-01T00:33:23.000Z Twitter Web App 1024518848383410177 1553447793808719874 1553901671863816194 @PrinceofAg @PrimeTimeRURS Bro respect my brai... soccer world cup 0 []
172 2022-08-01T00:25:42.000Z Twitter for iPhone 17792271 1553899739451637766 1553899739451637766 Just now catching up on women’s soccer. I’m so... soccer world cup 0 []
174 2022-08-01T00:10:14.000Z Twitter for iPhone 1054860611828490240 1553895847129223170 1553895847129223170 Fun fact regarding today’s Women’s Euro 2022 F... soccer world cup 0 []
175 2022-08-01T00:08:40.000Z Twitter for Android 167653265 1553821136974069760 1553895451815923712 @Recall_Gavin_21 More people watched the US Wo... soccer world cup 1 [https://pbs.twimg.com/media/FZCLrbdUYAAB_bj.jpg]

1035 rows × 9 columns

# Pull the list of image URLs from row 1 (column index 8 is "image_link").
image_link=df_final.iloc[1,8]
image_link
['https://pbs.twimg.com/media/FZHPGlqXwAIgTvs.png']
#getting the images from url
from PIL import Image
import requests
from io import BytesIO

# Download the first image URL of the selected tweet and decode it with PIL.
response = requests.get(image_link[0])
img = Image.open(BytesIO(response.content))
# `img` as the cell's last expression renders the image in the notebook.
img