tcounterfacto - counterfacto - small software tool to analyze twitter and highlight counterfactual statements
HTML git clone git://parazyd.org/counterfacto.git
DIR Log
DIR Files
DIR Refs
DIR README
DIR LICENSE
---
tcounterfacto (8746B)
---
1 #!/usr/bin/env python2
2 # Counterfacto is Copyright (c) 2016 by the Dyne.org Foundation
3 # as part of the PIEnews project
4 #
5 # This file is part of Counterfacto
6 # Written by Ivan J. <parazyd@dyne.org>
7 #
8 # This source code is free software; you can redistribute it and/or
9 # modify it under the terms of the GNU Public License as published by
10 # the Free Software Foundation; either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This source code is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. Please refer
16 # to the GNU Public License for more details.
17 #
18 # You should have received a copy of the GNU Public License along with
19 # this source code; if not, write to: Free Software Foundation, Inc.,
20 # 675 Mass Ave, Cambridge, MA 02139, USA.
21 #
22 # This project has received funding from the European Union's Horizon
23 # 2020 Programme for research, technological development and
24 # demonstration under grant agreement nr. 687922
25
26 import nltk
27 from nltk.tag.perceptron import PerceptronTagger
28 import re
29 import sys
30 from twitter import *
31 import twokenize
32
# Path of the file that receives the POS-tagged form of every processed tweet.
taggedFile = 'tagged.txt'

# The credentials file is plain Python executed in this module's namespace;
# main() expects it to define names such as oatoken, oasecret, conskey and
# conssecret.
# NOTE: exec() runs whatever the file contains -- only use a credentials file
# you wrote yourself.
try:
    with open('credentials') as fd:
        exec(fd.read())
except IOError:
    # Only a missing/unreadable file is reported this way; errors raised by
    # the credentials code itself propagate instead of being misdiagnosed.
    print('no credentials file found. please create it.')
    sys.exit(1)
43
def _usage():
    """Print the command-line synopsis and terminate with exit status 1."""
    print("usage: " + sys.argv[0] + " [-a account] [-f tweetfile] [-s searchterm]")
    sys.exit(1)


def _save_tweets(texts, path):
    """Write each tweet text to `path`, one per line, stripped to ASCII."""
    # Binary mode so the ASCII-encoded bytes can be written unchanged.
    with open(path, 'wb') as out:
        for text in texts:
            line = text.replace('\n', ' ').encode('ascii', 'ignore')
            out.write(line + b'\n')


def main():
    """Select the tweets named on the command line and classify them.

    -f tweetfile   classify the tweets stored one per line in `tweetfile`
    -a account     fetch up to 100 tweets from the account's timeline
    -s term ...    fetch up to 100 tweets matching the search term(s)

    Fetched tweets are saved to fetchedtweets-<argv[2]>.txt before being
    classified. Missing or unknown arguments print the usage text and exit
    with status 1; any other failure (network, file access, tagging)
    propagates so it is not mistaken for a usage error.
    """
    if len(sys.argv) < 3:
        _usage()

    mode = sys.argv[1]
    if mode == '-f':
        classify(sys.argv[2])
        return
    if mode not in ('-a', '-s'):
        _usage()

    ## credential check json
    #print(api.VerifyCredentials())
    api = Twitter(auth=OAuth(oatoken, oasecret, conskey, conssecret))

    if mode == '-a':
        statuses = api.statuses.user_timeline(screen_name=sys.argv[2],
                                              count=100)
    else:
        # Every remaining argument is part of the search phrase.
        searchterm = ' '.join(sys.argv[2:])
        # The search endpoint wraps the tweet list in a 'statuses' key.
        statuses = api.search.tweets(q=searchterm, count=100)['statuses']

    tweetfile = 'fetchedtweets-' + sys.argv[2] + '.txt'
    _save_tweets((s['text'] for s in statuses), tweetfile)
    classify(tweetfile)
107
108 ## {{{ processing functions
def tokenizelocal():
    """Print each tweet line followed by its twokenize token list.

    NOTE(review): this reads a module-level ``tweetFile`` object; the
    visible code only ever binds that name as a local inside other
    functions -- confirm where it is expected to come from.
    """
    for line in tweetFile.read().splitlines():
        print(line + '\n')
        tokens = twokenize.tokenize(line)
        print(str(tokens) + '\n')
114
def format_tweet(message):
    """Return `message` as a single-line, ASCII-only byte string.

    Byte strings are decoded as UTF-8 (undecodable bytes dropped) and any
    other object is converted to text first, so non-ASCII input no longer
    raises during the implicit ASCII conversion. Newlines become spaces and
    every non-ASCII character is removed.
    """
    if isinstance(message, bytes):
        text = message.decode('utf-8', 'ignore')
    else:
        # Unicode-aware equivalent of str(message).
        text = u'{}'.format(message)
    text = text.replace(u'\n', u' ')
    return text.encode('ascii', 'ignore')
120
def format_tagged(tagged_list):
    """Render (token, tag) pairs as ``token/tag/token/tag/.../`` plus newline.

    Each pair is first passed through postprocess_tag(), so the tag that is
    written may differ from the one supplied.
    """
    pieces = []
    for pair in tagged_list:
        token, tag = postprocess_tag(pair[0], pair[1])
        pieces.append(token + '/' + tag + '/')
    return ''.join(pieces) + '\n'
128
def postprocess_tag(token, tag):
    """Replace the tag of counterfactual cue words.

    Returns (token, 'MD') when is_twitter_cf_modal(token) holds, otherwise
    (token, 'CCJ') when tag_CCJ(token) holds, otherwise the pair unchanged.
    """
    if is_twitter_cf_modal(token):
        return token, 'MD'
    if tag_CCJ(token):
        return token, 'CCJ'
    return token, tag
136
# A tagged message containing a '?' token is treated as a question.
_QUESTION_RE = re.compile(r"\.*/\?/.", re.IGNORECASE)

# (form number, pattern) pairs, tried in order; the first hit wins.
_CF_FORM_PATTERNS = (
    # 1: "wish" used as a verb
    (1, re.compile(r"\.*(wish|wishing)/((VB.*/)|(JJ/))", re.IGNORECASE)),
    # 2: conjunction, past-tense verb, then modal
    (2, re.compile(r"\.*/CCJ/.*((/VBD/)|(/VBN/)).*/MD/", re.IGNORECASE)),
    # 3: converse order -- modal, conjunction, then past-tense verb
    (3, re.compile(r"\.*/MD/.*/CCJ/.*((/VBN/)|(/VBD/))", re.IGNORECASE)),
    # 4: "should have" and its informal spellings. The alternation between
    # "shoulda" and "shulda" was missing a '|', so neither matched alone.
    # NOTE(review): after "have/" only one \w* run precedes the /VBN/ or
    # /VBD/ tag, unlike the "had/<tag>/<token>" shape used for form 5 --
    # confirm this is intended.
    (4, re.compile(
        r"\.*/((should've)/MD/)"
        r"|(((should)|(shoulda)|(shulda)|(shuda)|(shudda)|(shudve))"
        r"/MD/((have)|(hve)|(ve))/)(\w)*((/VBN/)|(/VBD/))",
        re.IGNORECASE)),
    # 5: verb inversion ("had I ...", "were I ...")
    (5, re.compile(
        r"\.*(had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)).*/MD/)"
        r"|(were/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*/MD/)"
        r"|(/MD/.*/VB.*/had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)))",
        re.IGNORECASE)),
    # 6: two modals, each followed by a verb
    (6, re.compile(
        r"\.*/MD/.*((/VBN/)|(/VBD/)).*/MD/.*((/VBN/)|(/VBD/)|(/VB/)|(VBZ))",
        re.IGNORECASE)),
)


def get_cf_form(tagged_message):
    """Return the counterfactual form (1-6) of a tagged message, or 0.

    `tagged_message` is a string in ``token/tag/token/tag/.../`` form.
    Questions are rejected first; otherwise the form number of the first
    matching pattern is returned, and 0 when nothing matches.
    """
    if _QUESTION_RE.search(tagged_message) is not None:
        return 0
    for form, pattern in _CF_FORM_PATTERNS:
        if pattern.search(tagged_message) is not None:
            return form
    return 0
182
183
184
# Modal verbs, and the informal spellings of them, that are re-tagged as MD.
_CF_MODALS = frozenset((
    'should', "should've", 'shouldve', 'shoulda', 'shulda', 'shuda',
    'shudda', 'shudve',
    'would', "would've", 'wouldve', 'woulda', 'wuda', 'wulda', 'wudda',
    'wudve', 'wlda',
    'could', "could've", 'couldve', 'coulda', 'cudda', 'culda', 'cudve',
    'must', 'mustve',
    'might', "might've", 'mightve',
    'ought', 'may',
    "i'd", 'id', "we'd", 'youd', "you'd", "he'd", "she'd",
))


def is_twitter_cf_modal(word):
    """Return True if `word` is one of the listed modal spellings.

    The comparison is case-insensitive and non-ASCII characters are dropped
    first. Both byte strings and text strings are accepted; text input used
    to raise because it was decoded a second time.
    """
    if isinstance(word, bytes):
        text = word.decode('ascii', 'ignore')
    else:
        # Strip non-ASCII characters so text behaves like byte input.
        text = word.encode('ascii', 'ignore').decode('ascii')
    return text.lower() in _CF_MODALS
227
# Words that get the CCJ tag. They are single tokens taken from phrases such
# as: as long as, even if, if, one condition that, provided (that),
# providing (that), so long as, unless, whether... or, supposing, suppose,
# imagine, but for.
_CCJ_WORDS = frozenset((
    'as', 'if', 'even',
    'provided', 'providing',
    'suppose', 'supposing',
    'unless', 'whether',
    'envision', 'envisioning',
    'conceptualize', 'conceptualizing',
    'conjure', 'conjuring',
    'visualize', 'visualizing',
))


def tag_CCJ(word):
    """Return True if `word`, lower-cased, is one of the CCJ words."""
    return word.lower() in _CCJ_WORDS
254
def get_tagged_message(message, tagger):
    """Clean, tokenize and POS-tag `message`.

    Returns the format_tagged() rendering of the tagged tokens.
    """
    cleaned = format_tweet(message)
    token_list = twokenize.tokenize(cleaned)
    # None is passed as the tagset argument of nltk's tagging helper.
    tagged = nltk.tag._pos_tag(token_list, None, tagger)
    return format_tagged(tagged)
261 ## }}}
262
def classify(tweetfile):
    """Tag every tweet in `tweetfile` and print the counterfactual ones.

    `tweetfile` is the path of a text file holding one tweet per line. The
    tagged form of each tweet is written to the file named by the
    module-level `taggedFile`. Tweets for which get_cf_form() returns a
    non-zero form are printed, followed by a summary of how many of the
    tweets read were counterfactual.
    """
    tagger = PerceptronTagger()
    total = 0
    counterfactuals = 0

    print("Reading file...")
    # Both files are closed even if tagging fails part-way through.
    with open(tweetfile, 'r') as tweets, open(taggedFile, 'w') as tagged:
        for tweet in tweets:
            total += 1
            tagged_tweet = get_tagged_message(tweet, tagger)
            tagged.write(tagged_tweet)

            ## if our tweet is positive, print it
            if get_cf_form(tagged_tweet) != 0:
                print(tweet)
                counterfactuals += 1

    print("finished tagging...")
    # Report against the number of tweets read, not an assumed 100.
    print("counterfactuals: " + str(counterfactuals) + "/" + str(total))
303
# Run only when executed as a script, so importing the module has no side
# effect beyond the definitions above.
if __name__ == '__main__':
    main()