tmove parsers to factolib.py - counterfacto - small software tool to analyze twitter and highlight counterfactual statements HTML git clone git://parazyd.org/counterfacto.git DIR Log DIR Files DIR Refs DIR README DIR LICENSE --- DIR commit 5d141366aa5bc60d8ee62bc7229daa5b8d6d617b DIR parent a6e2fee6da44bd214d6db79113be431142bb60bf HTML Author: parazyd <parazyd@dyne.org> Date: Thu, 9 Mar 2017 00:14:49 +0100 move parsers to factolib.py Diffstat: M .gitignore | 3 ++- M README.md | 31 +++++++++++++++++-------------- M counterfacto-web | 52 ++++++++++++++++++++++++++----- A factolib.py | 199 +++++++++++++++++++++++++++++++ M templates/index.html | 6 +++--- 5 files changed, 266 insertions(+), 25 deletions(-) --- DIR diff --git a/.gitignore b/.gitignore t@@ -1,2 +1,3 @@ credentials -twokenize.pyc +*.pyc +*.txt DIR diff --git a/README.md b/README.md t@@ -1,5 +1,5 @@ - -# Counterfacto +Counterfacto +============ Counterfactual (noun) t@@ -11,7 +11,7 @@ Effects: it starts off with disappointment, then one will be able to uncover insights or knowledge that can be used to enhance future performance, leading to a better outcome in life. ----------------------------------------------------------------------------------- +----------------------------------------------------------------------- Counterfacto is a small software tool that can analyse search results on twitter to highlight counterfactual statements on certain topics. t@@ -24,7 +24,8 @@ We deem such a tool as a useful experiment, considering the importance of counterfactual analysis for political sentiment assessments and focus on news stories. -## Dependencies +Dependencies +------------ Python is required along the following packages: t@@ -43,7 +44,8 @@ After installing the necessary python modules, run `make`, which will then download the needed data for nltk, and tell you how to use your twitter credentials in counterfacto -### Running the web edition of counterfacto +Running the web edition of counterfacto +--------------------------------------- To run counterfacto along with its web interface, you will need additional dependencies: t@@ -52,7 +54,8 @@ additional dependencies: python-flask ``` -## Usage +Usage +----- ``` usage: ./counterfacto [-a account] [-f tweetfile] [-s searchterm] t@@ -64,17 +67,17 @@ The web interface can be ran with (port defaults to 5000): usage: ./counterfacto-web [-p port] ``` -## References - -- [Learning Representations for Counterfactual Inference (2016)](http://jmlr.org/proceedings/papers/v48/johansson16.pdf) - -- [Bounding and Minimizing Counterfactual Error (2016)](https://arxiv.org/abs/1606.03976) +References +---------- -- [Counterfactuals in the Language of Social Media: A Natural Language Processing Project in Conjunction with the World Well Being Project (2015)](http://www.seas.upenn.edu/~cse400/CSE400_2015_2016/reports/report_15.pdf) +* [Learning Representations for Counterfactual Inference (2016)](http://jmlr.org/proceedings/papers/v48/johansson16.pdf) +* [Bounding and Minimizing Counterfactual Error (2016)](https://arxiv.org/abs/1606.03976) +* [Counterfactuals in the Language of Social Media: A Natural Language Processing Project in Conjunction with the World Well Being Project (2015)](http://www.seas.upenn.edu/~cse400/CSE400_2015_2016/reports/report_15.pdf) -## Licensing +Licensing +--------- -Counterfacto is Copyright (C) 2016 by the Dyne.org Foundation +Counterfacto is Copyright (C) 2016-2017 by the Dyne.org Foundation as part of the PIEnews project Software written by Ivan J. <parazyd@dyne.org> DIR diff --git a/counterfacto-web b/counterfacto-web t@@ -1,10 +1,37 @@ #!/usr/bin/env python2 +# Copyright (c) 2017 Ivan J. <parazyd@dyne.org> +import json +import sys from flask import Flask, render_template, request, json -import os +from twitter import * + +import factolib + + +global tweetsFile +global taggedFile + +taggedFile = "tagged.txt" + +try: + with open('credentials') as fd: + exec(fd.read()) +except: + print("no credentials file found. please create it") + sys.exit(1) app = Flask(__name__) +def writetweets(tweets, twfile): + twfile = open(twfile, "w") + for s in tweets: + sintweet = s["text"] + sintweet = sintweet.replace("\n", " ") + sintweet = sintweet.encode("ascii", "ignore") + twfile.write(sintweet + "\n") + twfile.close() + @app.route("/") def main(): return render_template('index.html') t@@ -17,10 +44,23 @@ def search(): if not _name or not _method: return "Wrong data. Please try again." + api = Twitter(auth=OAuth(oatoken,oasecret,conskey,conssecret)) + if _method == "account": - os.system("./counterfacto -a " + _name) + statuses = api.statuses.user_timeline(screen_name=_name, count=100) + tweetsFile = "fetchedtweets-" + _name + ".txt" + writetweets(statuses, tweetsFile) + factolib.classify(tweetsFile, taggedFile) + elif _method == "searchterm": - os.system("./counterfacto -s " + _name) + statuses = api.search.tweets(q=_name, count=1) + tweetsFile = "fetchedsearch.txt" + writetweets(statuses, tweetsFile) + factolib.classify(tweetsFile, taggedFile) + + cfs = "counterfactuals.txt" + with open(cfs) as f: + return f.read() if __name__ == "__main__": try: t@@ -29,7 +69,5 @@ if __name__ == "__main__": except: _port = 5000 - app.run( - host="0.0.0.0", - port=int(_port) - ) + app.run(host="127.0.0.1", port=int(_port)) + #subprocess.call(["xdg-open", "http://127.0.0.1:" + _port]) DIR diff --git a/factolib.py b/factolib.py t@@ -0,0 +1,199 @@ +#!/usr/bin/env python2 +# Copyright (c) 2017 Ivan J. <parazyd@dyne.org + +import nltk +import re +import twokenize +from nltk.tag.perceptron import PerceptronTagger + +def tokenizelocal(): + tweets = tweetFile.read().splitlines() + for t in tweets: + print(t + '\n') + print(str(twokenize.tokenize(t)) + '\n') + +def format_tweet(message): + m = str(message) + m = m.replace('\n', ' ') + m = m.encode('ascii', 'ignore') + return m + +def format_tagged(tagged_list): + out = '' + for t in tagged_list: + token, tag = postprocess_tag(t[0], t[1]) + out = out + token + '/' + tag + '/' + out = out + '\n' + return out + +def postprocess_tag(token, tag): + outtag = tag + if (is_twitter_cf_modal(token)): + outtag = 'MD' + elif (tag_CCJ(token)): + outtag = 'CCJ' + return token, outtag + +def get_cf_form(tagged_message): + + # Filter out questions + pq = re.compile('\.*/\?/.', re.IGNORECASE) + if pq.search(tagged_message) != None: + return 0 + + # CASE 1 WISH VERB FORM + p1 = re.compile('\.*(wish|wishing)/((VB.*/)|(JJ/))', re.IGNORECASE) + if p1.search(tagged_message) != None: + return 1 + + + # CASE 2 CONJUNTION NORMAL + p2 = re.compile('\.*/CCJ/.*((/VBD/)|(/VBN/)).*/MD/', re.IGNORECASE) + if p2.search(tagged_message) != None: + return 2 + + + # CASE 3 CONJUNCTIVE CONVERSE + p3 = re.compile('\.*/MD/.*/CCJ/.*((/VBN/)|(/VBD/))', re.IGNORECASE) + if p3.search(tagged_message) != None: + return 3 + + + # CASE 5 Should have + p4 = re.compile('\.*/((should\'ve)/MD/)|(((should)|(shoulda)(shulda)|(shuda)|(shudda)|(shudve))/MD/((have)|(hve)|(ve))/)(\w)*((/VBN/)|(/VBD/))', re.IGNORECASE) + if p4.search(tagged_message) != None: + return 4 + + # CASE 6 VERB INVERSION + p5 = re.compile(("\.*(had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)).*/MD/)" + "|(were/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*/MD/)" + "|(/MD/.*/VB.*/had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)))"), re.IGNORECASE) + if p5.search(tagged_message) != None: + return 5 + + + # CASE 6 MODAL NORMAL + p6 = re.compile('\.*/MD/.*((/VBN/)|(/VBD/)).*/MD/.*((/VBN/)|(/VBD/)|(/VB/)|(VBZ))', re.IGNORECASE) + if p6.search(tagged_message) != None: + return 6 + + # If no matches + return 0 + + +def is_twitter_cf_modal(word): + w = unicode(word, errors='ignore').encode('utf-8').lower() + if (w == 'should' or + w == 'should\'ve' or + w == 'shouldve' or + w == 'shoulda' or + w == 'shulda' or + w == 'shuda' or + w == 'shudda' or + w == 'shudve' or + w == 'would' or + w == 'would\'ve' or + w == 'wouldve' or + w == 'woulda' or + w == 'wuda' or + w == 'wulda' or + w == 'wudda' or + w == 'wudve' or + w == 'wlda' or + w == 'could' or + w == 'could\'ve' or + w == 'couldve' or + w == 'coulda' or + w == 'cudda' or + w == 'culda' or + w == 'cudve' or + w == 'must' or + w == 'mustve' or + w == 'might' or + w == 'might\'ve' or + w == 'mightve' or + w == 'ought' or + w == 'may' or + w == 'i\'d' or + w == 'id' or + w == 'we\'d' or + w == 'youd' or + w == 'you\'d' or + w == 'he\'d' or + w == 'she\'d'): + return True + return False + +def tag_CCJ(word): + w = word.lower() + ''' + as long as, even if, if, one condition that, provided (that), + providing (that), so long as, unless, whether... or, supposing, + suppose, imagine, but for + ''' + if(w == 'as' or + w == 'if' or + w == 'even' or + w == 'provided' or + w == 'providing' or + w == 'suppose' or + w == 'supposing' or + w == 'unless' or + w == 'whether' or + w == 'envision' or + w == 'envisioning' or + w == 'conceptualize'or + w == 'conceptualizing' or + w == 'conjure' or + w == 'conjuring' or + w == 'visualize' or + w == 'visualizing'): + return True + return False + +def get_tagged_message(message, tagger): + tagset = None + formatted_message = format_tweet(message) + tokens = twokenize.tokenize(formatted_message) + tags = nltk.tag._pos_tag(tokens, tagset, tagger) + return format_tagged(tags) + +def classify(tweetfile, taggedfile): + tweetfile = open(tweetfile, "r") + taggedfile = open(taggedfile, "w") + counterfactuals = open('counterfactuals.txt', 'w') + + tagger = PerceptronTagger() + form_num = 8 + + cf_count = [[0 for x in range(form_num)] for x in range(form_num)] + + form_vec = [] + + print("Reading file...") + tweet = tweetfile.readline() + + while tweet: + taggedTweet = get_tagged_message(tweet, tagger) + taggedfile.write(taggedTweet) + form = int(get_cf_form(taggedTweet)) + + if form: + print(tweet) + counterfactuals.write(tweet + '<hr>\n') + + form_vec.append(form) + cf_count[form][0] += 1 + tweet = tweetfile.readline() + + count = 0 + for i in xrange(1, form_num): + count += cf_count[i][0] + + print("Finished tagging...") + tweetfile.close() + taggedfile.close() + + print("counterfactuals: " + str(count) + "/100") + counterfactuals.write("counterfactuals: " + str(count) + "/100<br>\n") + counterfactuals.close() DIR diff --git a/templates/index.html b/templates/index.html t@@ -11,9 +11,9 @@ <script type="text/javascript" href="../static/js/search.js"></script> </head> <body> - <div class="container"> + <!-- <div class="container"> <h3 class="text-muted">Counterfacto</h3> - </div> + </div> --> <div class="jumbotron"> <h1>Counterfacto</h1> t@@ -28,7 +28,7 @@ </div> <footer class="footer"> - <p>© PIEnews / Dyne.org 2016-2116</p> + <center><p>© PIEnews / Dyne.org Foundation 2016-2017</p></center> </footer> </body> </html>