Haiku Twitter Bot
Over summer 2016, I decided to make a Twitter bot just for fun. Here, I'll describe the bot's programming at a high level. But first, a little background...
I started undergraduate as an English major... then graduated with a degree in neuroscience. Although I have pretty firmly switched my career plans toward math and science, I still appreciate and respect art. As a reflection on my artistic pipe dream, I designed my bot to write haikus. If you don't remember haikus from high school, hit that link in the previous sentence.
For connecting to Twitter, I used the tweepy library. I simply wrote a subclass of StreamListener to process incoming tweets and write them to a file called tweets.txt. You can read more about this process in the tweepy docs (linked above).
from tweepy.streaming import StreamListener


class StdOutListener(StreamListener):
    """A listener that handles tweets received from the stream.

    It pulls the text out of each incoming tweet, prints any new words,
    and appends them to tweets.txt so they can be recycled into haikus.
    """
    def __init__(self):
        super(StdOutListener, self).__init__()
        self.dat = list()   # every word seen so far
        self.i = 0          # number of tweets processed

    def on_data(self, data):
        # Crudely pull the "text" field out of the raw JSON payload
        sdat = data.split('"text":')[1].split(',')[0].replace('"', '')
        # Drop anything containing a slash or backslash (URLs, escape sequences)
        ssdat = [sw for sw in sdat.split(' ')
                 if not any([na for na in ['/', '\\'] if na in sw])]
        with open('tweets.txt', 'a+') as fw:
            for ss in ssdat:
                if ss not in self.dat:   # only keep words we haven't seen before
                    print(ss)
                    fw.write(ss + '\n')
        self.dat.extend(ssdat)
        self.i += 1
        return True

    def on_error(self, status):
        print(status)
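For completeness, here is roughly how a listener like this gets attached to the live stream using tweepy's 3.x-style Stream class. This is just a sketch: the credential variables are placeholders for your own app's keys, and the track keywords are illustrative, not necessarily what my bot filters on.

from tweepy import OAuthHandler, Stream

# Placeholder credentials -- substitute your own app's keys
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

listener = StdOutListener()
stream = Stream(auth, listener)
stream.filter(track=['haiku', 'poetry'])  # collect tweets mentioning these terms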
Most of the high-level functionality lives in a single function, do_tweet(), which I call each time I start the bot.
import time

from tweepy import API


def do_tweet(auth, tttw=10):
    api = API(auth)                         # connect to Twitter using my authorization info
    words = get_words('tweets.txt')         # get words from general tweets
    words.extend(get_haikus('haiku.txt'))   # get previously written but un-tweeted haikus
    words.extend(get_best('best.txt'))      # get list of "best" tweets
    words.extend(get_words('search.txt'))   # get list of specifically searched terms
    sdict = make_sdict(words)               # map each word to its number of syllables
    # Make a list of haikus and tweet them every 15 minutes
    haik = list()
    j = 0
    while j < tttw:
        h = haiku(sdict).lower()
        haik.append(h)
        print(h)
        api.update_status(h)
        print("Tweeted! at %s" % time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()))
        time.sleep(900)                     # wait 15 minutes between tweets
        j += 1
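do_tweet() leans on a few helpers that aren't shown in full here. get_words() just pulls individual words back out of a text file like tweets.txt; a minimal sketch (my actual helper may differ slightly) looks something like this:

def get_words(fname):
    """Read a file of whitespace-separated words and return a flat word list."""
    words = list()
    with open(fname, 'r') as f:
        for line in f:
            words.extend(line.strip().split(' '))
    return [w for w in words if w]  # drop empty strings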
For counting syllables and determining parts of speech, I used NLTK.
Counting syllables:
from nltk.corpus import cmudict

d = cmudict.dict()

def nsyl(word):
    # Count the stress-marked phonemes (the ones ending in a digit) in each CMU pronunciation
    return [len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]]
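The make_sdict() function called in do_tweet() pairs each word with its syllable count using nsyl(). My actual implementation isn't shown above; a minimal sketch, assuming words missing from the CMU dictionary are simply dropped, looks like this:

def make_sdict(words):
    """Map each word to its syllable count; skip words cmudict doesn't know."""
    sdict = dict()
    for w in words:
        w = w.lower().strip()
        try:
            sdict[w] = nsyl(w)[0]  # take the count for the first listed pronunciation
        except KeyError:
            continue               # not in cmudict, so we can't count its syllables
    return sdict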
Choosing words for the haiku:
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag


def pick_syl(target):
    """Randomly split `target` syllables into a list of word-sized chunks."""
    si = 0
    choices = list()
    while si != target:
        if target - si != 1:
            cr = np.random.randint(1, target - si)  # syllables for the next word
            if cr > 3:
                cr = 1                              # if the draw is over 3 syllables, use 1 instead
            choices.append(cr)
            si += cr
        else:
            choices.append(1)                       # exactly one syllable left
            return choices
    return choices


def syl_of_size(sdict, size):
    """All words in the dictionary with exactly `size` syllables."""
    return [w for w in sdict.keys() if sdict[w] == size]


def choose_words(sdict, nr_syl, line):
    """Pick words totalling `nr_syl` syllables for the given haiku line."""
    words = list()
    ix = 0
    for sw in pick_syl(nr_syl):
        if ix == 0 and line == 0:
            # First word of the first line: a noun
            sdict1 = dict([(sk, sv) for sk, sv in sdict.items()
                           if pos_tag(word_tokenize(sk), tagset='universal')[0][1] == 'NOUN'])
            gsyls = syl_of_size(sdict1, sw)
        elif ix == 1 and line == 0:
            # Second word of the first line: a verb
            sdict2 = dict([(sk, sv) for sk, sv in sdict.items()
                           if pos_tag(word_tokenize(sk), tagset='universal')[0][1] == 'VERB'])
            gsyls = syl_of_size(sdict2, sw)
        elif ix == 2 and line == 0:
            # Third word of the first line: anything but a noun or verb
            sdict3 = dict([(sk, sv) for sk, sv in sdict.items()
                           if pos_tag(word_tokenize(sk), tagset='universal')[0][1] not in ['NOUN', 'VERB']])
            gsyls = syl_of_size(sdict3, sw)
        else:
            gsyls = syl_of_size(sdict, sw)
        cwr = np.random.randint(len(gsyls))  # pick one of the candidates at random
        chosen = gsyls[cwr]
        words.append(chosen)
        ix += 1
    return words
As you can see, the algorithm randomly chooses a word of syllable length sw, where sw comes from the pick_syl() function. If this is hard to interpret, don't worry. It took me a while to come up with an algorithm that would randomly choose words but still adhere to the 5/7/5 haiku format. Conditions like if ix == 0 and line == 0 determine which line of the haiku is being written and which word within that line. In this case, the first word of the first line is being chosen. Then the lines:
sdict1 = dict([(sk, sv) for sk, sv in sdict.items()
               if pos_tag(word_tokenize(sk), tagset='universal')[0][1] == 'NOUN'])  # keep only the nouns
gsyls = syl_of_size(sdict1, sw)  # candidate nouns of syllable length sw
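To make the flow concrete, here is a hypothetical run, assuming sdict has already been built with make_sdict(). The actual output depends entirely on the corpus and the random draws; the words shown are just illustrative.

print(pick_syl(5))                # e.g. [1, 3, 1] -- random chunks summing to 5 syllables
print(choose_words(sdict, 5, 0))  # e.g. ['rain', 'following', 'dusk'] (noun, verb, something else)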
Then, setting up the haiku in order and writing to the file haiku.txt:
def haiku(sdict):
    # Ignore very short words (fewer than 3 characters)
    sdict = dict([(sk, sv) for sk, sv in sdict.items() if len(sk) >= 3])
    first_5 = choose_words(sdict, 5, 0)
    seven = choose_words(sdict, 7, 1)
    sec_5 = choose_words(sdict, 5, 2)
    first_line = ' '.join(first_5)
    second_line = ' '.join(seven)
    third_line = ' '.join(sec_5)
    hai = ', '.join([first_line, second_line, third_line])  # 5/7/5 lines separated by commas
    with open('haiku.txt', 'a+') as h:
        h.write(hai + '\n')
    return hai
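Putting the pieces together, a single call generates one haiku, appends it to haiku.txt, and returns the string that do_tweet() posts. This assumes the helpers above are defined and tweets.txt already has some words in it:

sdict = make_sdict(get_words('tweets.txt'))  # word -> syllable count
print(haiku(sdict))                          # prints a new 5/7/5 haiku (also appended to haiku.txt)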
So far, this bot writes haikus with randomly chosen words, which is cool... but we can do better! ;^)
Eventually, when I have a little more time, I'll give meta ai ai haiku more smarts. For now, the bot uses a simple algorithm to determine its "best" tweets, then recycles words from those tweets. Dumb, I know... it's a work in progress.
import json

import numpy as np
from tweepy import API, OAuthHandler

auth = OAuthHandler(consumer_key, consumer_secret)  # connect to Twitter via OAuth
auth.set_access_token(access_token, access_token_secret)
api = API(auth)

def get_haikus(api):
    return api.user_timeline('@meta_aiai_haiku')  # get haikus from my timeline

def get_search(api):
    return api.search('genius')                   # search for a term (e.g. 'genius')

counts = dict()
rs = list()
fs = list()
for h in get_haikus(api):
    rs.append(h.retweet_count)    # count retweets and favorites for each tweet in the list
    fs.append(h.favorite_count)
    stext = [ss for ss in h.text.split(' ') if '/' not in ss]
    counts[' '.join(stext)] = (h.retweet_count, h.favorite_count)  # store in a dict
for st in get_search(api):        # do the same for any specific search terms
    rs.append(st.retweet_count)
    fs.append(st.favorite_count)
    stext = [ss for ss in st.text.split(' ') if '/' not in ss]
    counts[' '.join(stext)] = (st.retweet_count, st.favorite_count)
rs = np.array(rs)                 # make an array with the retweet counts
if any([rr for rr in rs if rr != 0.0]):
    norm_r = (rs - np.min(rs)) * 100.00 / ((np.max(rs) - np.min(rs)) * 100.00)  # normalize the counts
else:
    norm_r = rs
fs = np.array(fs)                 # same for the favorites
if any([ff for ff in fs if ff != 0.0]):
    norm_f = (fs - np.min(fs)) * 100.00 / ((np.max(fs) - np.min(fs)) * 100.00)
else:
    norm_f = fs
for ix, k in enumerate(counts.keys()):
    # Likelihood of a tweet's words being reused: normalized retweets plus favorites,
    # scaled by a random factor between 0.55 and 0.76
    weight = 0.01 * np.random.randint(55, 77) * (norm_r[ix] + norm_f[ix])
    counts[k] = weight
with open('best.txt', 'w+') as bf:  # dump to best.txt
    json.dump(counts, bf)
The function get_best() chooses tweets with the highest "weight" to include in the bot's corpus for writing future haikus.
import json
from operator import itemgetter

def get_best(file):
    with open(file, 'r') as f:
        bf = json.load(f)
    sbf = sorted(bf.items(), key=itemgetter(1), reverse=True)  # sort by weight, highest first
    sbf = [bb[0] for bb in sbf]                                 # keep just the tweet text
    return sbf[0:int(len(sbf)/2.0)]                             # return the top half
Here's a link to my Twitter bot meta ai ai haiku: robcapps.com/docs/haiku. Please feel free to ask questions in the comments section!