airlines_df.loc[:,['description','statuses_count','user_name','user_screen_name','user_lang']].head(20)
description | statuses_count | user_name | user_screen_name | user_lang | |
---|---|---|---|---|---|
0 | We’re here to offer advice and inspiration for... | 1794976 | American Airlines | AmericanAir | en |
1 | JAL(日本航空)の「今」をお伝えする公式アカウントです!「誰かに話したくなっちゃう!」そん... | 1175864 | JAPAN AIRLINES【JAL】 | JAL_Official_jp | ja |
2 | Official global account of KLM. While we keep ... | 951562 | Royal Dutch Airlines | KLM | en |
3 | Official British Airways Twitter account. We l... | 871422 | British Airways | British_Airways | en |
4 | Conversations with you here are 280 characters... | 866650 | United | united | en |
5 | Official account of Delta Air Lines. We’re lis... | 611941 | Delta | Delta | en |
6 | http://t.co/O12EYOWlB1 Deals, Offers for All F... | 538295 | Qatar Flights | qatarflights | en |
7 | We run on #SouthwestHeart. Follow for news & s... | 534825 | Southwest Airlines | SouthwestAir | en |
8 | Hi, nice to tweet you! Fly with us to 101 dest... | 485314 | JetBlue Airways | JetBlue | en |
9 | #AvGeek researching and writing things about a... | 225071 | Jason Rabinowitz | AirlineFlyer | en |
10 | Official account of Alaska Airlines. Our Socia... | 189825 | Alaska Airlines | AlaskaAir | en |
11 | Welcome to Turkish Airlines Customer Support! ... | 177372 | TK HelpDesk | TK_HelpDesk | tr |
12 | NaN | 177199 | US Airways | USAirways | en |
13 | Travel Agent from Caracas. Find special offers... | 174762 | GregoriaTravels 2.0 | GregoriaTravels | en |
14 | The official Frontier Airlines customer servic... | 147673 | Frontier Airlines | FrontierCare | en |
15 | Worked in Sales & Marketing4Overseas National ... | 142226 | Sweet Little Sheila | Andromodid | en |
16 | 19,,, who is airways | 134677 | nati | illegalxsong | en |
17 | Hotel Booking International and Domestic Airli... | 134636 | Flight Ticket Hotels | Flight_Tickets | en |
18 | Your go-to source for travel news. Covering ev... | 112294 | TravelPulse | TravelPulse | en |
19 | Hello Twitterattis! You can tweet to us with #... | 107852 | Jet Airways | jetairways | en |
# filtering the verified and english accounts
top_20_airlines_df = airlines_df[(airlines_df.verified==True) & (airlines_df.user_lang == 'en')].reset_index(drop=True).head(39)
top_20_airlines_df.loc[:,['description','statuses_count','user_name','user_screen_name','user_lang']].head(20)
description | statuses_count | user_name | user_screen_name | user_lang | |
---|---|---|---|---|---|
0 | We’re here to offer advice and inspiration for... | 1794976 | American Airlines | AmericanAir | en |
1 | Official global account of KLM. While we keep ... | 951562 | Royal Dutch Airlines | KLM | en |
2 | Official British Airways Twitter account. We l... | 871422 | British Airways | British_Airways | en |
3 | Conversations with you here are 280 characters... | 866650 | United | united | en |
4 | Official account of Delta Air Lines. We’re lis... | 611941 | Delta | Delta | en |
5 | We run on #SouthwestHeart. Follow for news & s... | 534825 | Southwest Airlines | SouthwestAir | en |
6 | Hi, nice to tweet you! Fly with us to 101 dest... | 485314 | JetBlue Airways | JetBlue | en |
7 | #AvGeek researching and writing things about a... | 225071 | Jason Rabinowitz | AirlineFlyer | en |
8 | Official account of Alaska Airlines. Our Socia... | 189825 | Alaska Airlines | AlaskaAir | en |
9 | Hello Twitterattis! You can tweet to us with #... | 107852 | Jet Airways | jetairways | en |
10 | This is the OFFICIAL twitter site of Philippin... | 94662 | Philippine Airlines | flyPAL | en |
11 | Welcome to the official Etihad Airways Twitter... | 72080 | Etihad Airways | EtihadAirways | en |
12 | حساب السعودية الرسمي حيث تجدون آخر الأخبار وال... | 69405 | SAUDIA | السعودية | Saudi_Airlines | en |
13 | Defining the Future of Travel. \n\nGlobal trav... | 65015 | Skift | skift | en |
14 | Australia's all day, every day low fares airli... | 57912 | Jetstar Airways | JetstarAirways | en |
15 | Africa's Leading Airline. Th... | 54950 | Kenya Airways | KenyaAirways | en |
16 | Our social media team is at your service for a... | 50850 | Brussels Airlines | FlyingBrussels | en |
17 | Official account of #QatarAirways. Follow us f... | 48429 | Qatar Airways | qatarairways | en |
18 | Welcome aboard Singapore Airlines on Twitter! ... | 46490 | Singapore Airlines | SingaporeAir | en |
19 | This is the Official Spirit Airlines Twitter a... | 37333 | Spirit Airlines | SpiritAirlines | en |
# removing non-airlines accounts and non-english accounts
top_20_airlines_df = top_20_airlines_df.drop(top_20_airlines_df.index[[7,12,13,20]]).reset_index(drop=True)
top_20_airlines_df.loc[:,['description','statuses_count','user_name','user_screen_name','user_lang']].head(20)
description | statuses_count | user_name | user_screen_name | user_lang | |
---|---|---|---|---|---|
0 | We’re here to offer advice and inspiration for... | 1794976 | American Airlines | AmericanAir | en |
1 | Official global account of KLM. While we keep ... | 951562 | Royal Dutch Airlines | KLM | en |
2 | Official British Airways Twitter account. We l... | 871422 | British Airways | British_Airways | en |
3 | Conversations with you here are 280 characters... | 866650 | United | united | en |
4 | Official account of Delta Air Lines. We’re lis... | 611941 | Delta | Delta | en |
5 | We run on #SouthwestHeart. Follow for news & s... | 534825 | Southwest Airlines | SouthwestAir | en |
6 | Hi, nice to tweet you! Fly with us to 101 dest... | 485314 | JetBlue Airways | JetBlue | en |
7 | Official account of Alaska Airlines. Our Socia... | 189825 | Alaska Airlines | AlaskaAir | en |
8 | Hello Twitterattis! You can tweet to us with #... | 107852 | Jet Airways | jetairways | en |
9 | This is the OFFICIAL twitter site of Philippin... | 94662 | Philippine Airlines | flyPAL | en |
10 | Welcome to the official Etihad Airways Twitter... | 72080 | Etihad Airways | EtihadAirways | en |
11 | Australia's all day, every day low fares airli... | 57912 | Jetstar Airways | JetstarAirways | en |
12 | Africa's Leading Airline. Th... | 54950 | Kenya Airways | KenyaAirways | en |
13 | Our social media team is at your service for a... | 50850 | Brussels Airlines | FlyingBrussels | en |
14 | Official account of #QatarAirways. Follow us f... | 48429 | Qatar Airways | qatarairways | en |
15 | Welcome aboard Singapore Airlines on Twitter! ... | 46490 | Singapore Airlines | SingaporeAir | en |
16 | This is the Official Spirit Airlines Twitter a... | 37333 | Spirit Airlines | SpiritAirlines | en |
17 | Memorable journeys inspired by Malaysia's dive... | 33696 | Malaysia Airlines | MAS | en |
18 | SA's loved low cost airline, flying between mo... | 31716 | Mango Airlines | FlyMangoSA | en |
19 | Your source for Hawaiian Airlines news, tips, ... | 29096 | Hawaiian Airlines | HawaiianAir | en |
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment_analyzer = SentimentIntensityAnalyzer()
tweets_df['sentiment_score'] = tweets_df.text.apply(lambda x: sentiment_analyzer.polarity_scores(x)['compound'])
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.metrics import classification_report
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import re
%matplotlib inline
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()
reason_df = pd.read_csv('Tweets_reason.csv')
reason_df=reason_df[reason_df.airline_sentiment == 'negative'].reset_index(drop=True)
reason_df=reason_df[(reason_df.negativereason_confidence == 1) & (reason_df.airline_sentiment_confidence==1)].reset_index(drop=True)
reason_df.negativereason.value_counts()
Customer Service Issue 1317 Late Flight 738 Cancelled Flight 406 Lost Luggage 387 Can't Tell 222 Flight Attendant Complaints 120 Bad Flight 117 Flight Booking Problems 77 Damaged Luggage 27 longlines 25 Name: negativereason, dtype: int64
reason_df.loc[(reason_df.negativereason=='Damaged Luggage') | (reason_df.negativereason=='Lost Luggage'),'negativereason']='Lost and Damaged Luggage'
reason_df=reason_df[(reason_df.negativereason!='longlines') & (reason_df.negativereason!='Flight Booking Problems')]
reason_df.negativereason.value_counts()
Customer Service Issue 1317 Late Flight 738 Lost and Damaged Luggage 414 Cancelled Flight 406 Can't Tell 222 Flight Attendant Complaints 120 Bad Flight 117 Name: negativereason, dtype: int64
text_clf = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', SGDClassifier(loss='hinge', penalty='none',
alpha=0.001,
max_iter=5, tol=None,class_weight='balanced')),])
X_train, X_test, y_train, y_test = train_test_split(reason_df['text'],reason_df.negativereason,test_size=0.33,random_state=53,stratify=reason_df.negativereason)
text_clf.fit(X_train,y_train)
predicted_test = text_clf.predict(X_test)
print(classification_report(predicted_test, y_test))
precision recall f1-score support Bad Flight 0.59 0.55 0.57 42 Can't Tell 0.64 0.57 0.61 82 Cancelled Flight 0.90 0.90 0.90 133 Customer Service Issue 0.86 0.91 0.88 413 Flight Attendant Complaints 0.54 0.51 0.53 41 Late Flight 0.86 0.85 0.85 245 Lost and Damaged Luggage 0.89 0.84 0.87 145 avg / total 0.83 0.83 0.83 1101
parameters = {
'vect__stop_words':('english',None),
'tfidf__norm':('l1','l2'),
'clf__loss': ('hinge','log'),
'clf__alpha': (0.001,0.0001,0.00001, 0.000001),
'clf__penalty': ('l1','l2','none'),
'clf__max_iter': [10,100,1000],
}
grid_clf = GridSearchCV(text_clf,param_grid=parameters,cv=5,n_jobs=5)
grid_clf.fit(reason_df['text'],reason_df.negativereason)
grid_clf.best_score_
0.82153569286142769
grid_clf.best_estimator_.fit(X_train,y_train)
print(classification_report(grid_clf.best_estimator_.predict(X_test), y_test))
precision recall f1-score support Bad Flight 0.44 0.61 0.51 28 Can't Tell 0.73 0.69 0.71 77 Cancelled Flight 0.92 0.88 0.90 140 Customer Service Issue 0.88 0.90 0.89 423 Flight Attendant Complaints 0.62 0.71 0.66 34 Late Flight 0.90 0.85 0.87 257 Lost and Damaged Luggage 0.91 0.88 0.90 142 avg / total 0.86 0.86 0.86 1101
gender_df = pd.read_csv('gender-classifier-DFE-791531.csv',encoding='latin1')
gender_df = gender_df[((gender_df.gender == 'male') | (gender_df.gender == "female")) & (gender_df['gender:confidence'] == 1) ].reset_index(drop=True)
def stem_and_lem(text):
# Remove non-ASCII chars.
text = re.sub('[^\x00-\x7F]+',' ', text)
# Remove double spaces.
text = re.sub('\s+',' ',text)
text = "".join([stemmer.stem(word) for word in text])
text = "".join([lemmatizer.lemmatize(word) for word in text])
return text
gender_df.text=gender_df.text.apply(stem_and_lem)
feature_gender_df = pd.DataFrame()
feature_gender_df['features'] = ''
for k,item in enumerate(gender_df.loc[:,['text','description','sidebar_color','link_color']].astype(str).values):
feature_gender_df.loc[k]=" ".join(item)
gender_clf = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', SGDClassifier(loss='hinge', penalty='l2',
alpha=0.001,
max_iter=5, tol=None,
class_weight='balanced')),])
X_train, X_test, y_train, y_test = train_test_split(feature_gender_df.features,gender_df.gender.astype(str),test_size=0.33,random_state=0,stratify=gender_df.gender)
parameters = {
'vect__stop_words':('english',None),
'tfidf__norm':('l1','l2'),
'clf__loss': ('hinge','log','modified_huber'),
'clf__alpha': (0.001,0.0001,0.00001, 0.000001),
'clf__penalty': ('none','l1','l2'),
'clf__max_iter': [10,100,1000],
}
grid_gender_clf = GridSearchCV(gender_clf,param_grid=parameters,cv=5,n_jobs=5)
grid_gender_clf.fit(feature_gender_df.features,gender_df.gender.astype(str))
grid_gender_clf.best_score_
0.70009980039920161
grid_gender_clf.best_estimator_.fit(X_train,y_train)
print(classification_report(grid_gender_clf.best_estimator_.predict(X_test), y_test))
precision recall f1-score support female 0.73 0.72 0.72 1791 male 0.67 0.68 0.68 1516 avg / total 0.70 0.70 0.70 3307