Python: Negative / Positive Analysis: Twitter Negative / Positive Analysis Using RNN-Part 1

What is RNN?

With a polarity dictionary alone, it is difficult to judge negative and positive by reading the context of a sentence, and the analysis may not be reliable.

Here, we learn how to perform negative / positive analysis from the flow of a sentence using a recurrent neural network (RNN).

Because an RNN can memorize and learn from previously computed information, it can predict the probability of the word that comes next in a sentence, which is why it is also used for machine translation.

Of course, this can also be done in Japanese, but because of the data available, this time we analyze English text.

Twitter Negative / Positive Analysis

Twitter limits posts to 140 characters, so users write in short sentences.

Because a large amount of data can be collected, Twitter is used for many kinds of natural language processing, including negative / positive analysis.

This time, we train on Twitter data about US airlines, using the Airline Twitter sentiment dataset distributed by Figure Eight in the United States.

See the dataset's distribution page for the license.

import pandas as pd

#Load the Tweet data and keep only the text and sentiment columns
Tweet = pd.read_csv('./6020_negative_positive_data/data/Airline-Sentiment-2-w-AA.csv', encoding='cp932')
tweetData = Tweet.loc[:,['text', 'airline_sentiment']]
print(tweetData)


Creating a database

Removing frequent words

Because the RNN analyzes how words relate to one another, very frequently occurring words need to be removed.

Frequently used words such as "I" and "what" are called stop words. Google's search engine, for example, excludes stop words from its index so that the remaining words carry more relevance.

On Twitter, "@", which marks a reply, appears very frequently, and in this airline dataset so does the word "flight". We create the data with those words removed.
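
For reference, here is a quick way to inspect the stop word list that NLTK provides. This snippet is not part of the original tutorial and assumes the stopwords corpus has already been downloaded (see the note in the code below).

from nltk.corpus import stopwords

#Show how many English stop words there are, and the first few of them
print(len(stopwords.words("english")))
print(stopwords.words("english")[:10])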

import nltk
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
#If loading the stopwords raises an error, run:
#nltk.download('stopwords')

#Load Tweet data
Tweet = pd.read_csv('./6020_negative_positive_data/data/Airline-Sentiment-2-w-AA.csv', encoding='cp932')
tweetData = Tweet.loc[:,['text','airline_sentiment']]

#Performs morphological analysis of the English tweets
def tweet_to_words(raw_tweet):

    #Keep only the letters a-z/A-Z and '@', then lowercase and split into a list of words
    letters_only = re.sub("[^a-zA-Z@]", " ", raw_tweet)
    words = letters_only.lower().split()

    #Remove stop words and words starting with '@' or 'flight'
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops and not re.match("^[@]", w) and not re.match("flight", w)]
    return " ".join(meaningful_words)

cleanTweet = tweetData['text'].apply(lambda x: tweet_to_words(x))
print(cleanTweet)


Create a database of words

To find out which words influence the negative / positive judgement, we first create a database of all the words.

This database is used to tag each word with a number based on its frequency and to relate the words to the negative / positive labels.

import nltk
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords

Tweet = pd.read_csv('./6020_negative_positive_data/data/Airline-Sentiment-2-w-AA.csv', encoding='cp932')
tweetData = Tweet.loc[:,['text','airline_sentiment']]

def tweet_to_words(raw_tweet):

    #Keep only the letters a-z/A-Z and '@', then lowercase and split into a list of words
    letters_only = re.sub("[^a-zA-Z@]", " ", raw_tweet)
    words = letters_only.lower().split()

    #Remove stop words and words starting with '@' or 'flight'
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops and not re.match("^[@]", w) and not re.match("flight", w)]
    return " ".join(meaningful_words)

cleanTweet = tweetData['text'].apply(lambda x: tweet_to_words(x)) 

#Create a database
all_text = ' '.join(cleanTweet)
words = all_text.split()
print(words)


Digitize words

Each word is given a numerical tag based on how many times it occurs. We then create a new list by converting the cleanTweet strings used for training into these numbers.

import nltk
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from collections import Counter

Tweet = pd.read_csv('./6020_negative_positive_data/data/Airline-Sentiment-2-w-AA.csv', encoding='cp932')
tweetData = Tweet.loc[:,['text','airline_sentiment']]

def tweet_to_words(raw_tweet):

    #Keep only the letters a-z/A-Z and '@', then lowercase and split into a list of words
    letters_only = re.sub("[^a-zA-Z@]", " ", raw_tweet)
    words = letters_only.lower().split()

    #Remove stop words and words starting with '@' or 'flight'
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops and not re.match("^[@]", w) and not re.match("flight", w)]
    return " ".join(meaningful_words)

cleanTweet = tweetData['text'].apply(lambda x: tweet_to_words(x))

#Create a database
all_text = ' '.join(cleanTweet)
words = all_text.split()

#Count the number of times a word appears
counts = Counter(words)
#Sort the words in descending order of frequency
vocab = sorted(counts, key=counts.get, reverse=True)
#Assign IDs starting from 1 (0 is reserved for padding later)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}
#print(vocab_to_int)
#Stores the digitized string in a new list
tweet_ints = []
for each in cleanTweet:
    tweet_ints.append([vocab_to_int[word] for word in each.split()])
    
print(tweet_ints)
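
As a small illustration (with made-up words, not taken from the dataset), the numbering scheme above assigns 1 to the most frequent word, 2 to the next most frequent, and so on:

from collections import Counter

#Toy example with hypothetical words
toy_words = ["late", "late", "bag", "thanks", "late", "bag"]
toy_counts = Counter(toy_words)
toy_vocab = sorted(toy_counts, key=toy_counts.get, reverse=True)
toy_vocab_to_int = {word: ii for ii, word in enumerate(toy_vocab, 1)}
print(toy_vocab_to_int)   # {'late': 1, 'bag': 2, 'thanks': 3}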


Negative / Positive Quantification

Next, quantify the negative / positive label given to each sentence. This time we convert negative to 0, positive to 1, and neutral to 2.

These numbers are used as the labels when training on the sentences built from the words.

import numpy as np
import pandas as pd

#Load Tweet data
Tweet = pd.read_csv('./6020_negative_positive_data/data/Airline-Sentiment-2-w-AA.csv', encoding='cp932')
tweetData = Tweet.loc[:,['text','airline_sentiment']]

#Convert the tweet's negative / positive string label to a number
labels = np.array([0 if each == 'negative' else 1 if each == 'positive' else 2 for each in tweetData['airline_sentiment'][:]]) 

print(labels)
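
As a quick sanity check (not part of the original code), np.bincount can be used to count how many tweets fall into each class:

#Index 0 = negative, 1 = positive, 2 = neutral
print(np.bincount(labels))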

Align the number of columns

Each entry in the tweet_ints list we created holds a different number of words per tweet.

For training, the number of columns in the list must be aligned. Rows whose word count dropped to 0 during the cleanTweet processing are also removed from each list.

from collections import Counter

#This is the code content of the previous section-------------------------
import numpy as np
import pandas as pd

#Load Tweet data
Tweet = pd.read_csv('./6020_negative_positive_data/data/Airline-Sentiment-2-w-AA.csv', encoding='cp932')
tweetData = Tweet.loc[:,['text','airline_sentiment']]

#Convert the tweet's negative / positive string label to a number
labels = np.array([0 if each == 'negative' else 1 if each == 'positive' else 2 for each in tweetData['airline_sentiment'][:]])

# ----------------------------------------

#Stores the digitized string in a new list
#(cleanTweet and vocab_to_int are the objects created in the earlier sections)
tweet_ints = []
for each in cleanTweet:
    tweet_ints.append([vocab_to_int[word] for word in each.split()])

#Find out the number of words in Tweet
tweet_len = Counter([len(x) for x in tweet_ints])
print(tweet_len)
seq_len = max(tweet_len)
print("Zero-length reviews: {}".format(tweet_len[0]))
print("Maximum review length: {}".format(max(tweet_len)))

#Remove from each list the rows whose word count became 0 after the cleanTweet processing
tweet_idx  = [idx for idx,tweet in enumerate(tweet_ints) if len(tweet) > 0]
labels = labels[tweet_idx]
tweetData = tweetData.iloc[tweet_idx]
tweet_ints = [tweet for tweet in tweet_ints if len(tweet) > 0]

#Create a zero-filled array and write each row's word IDs from the right so that every row has the same number of columns
features = np.zeros((len(tweet_ints), seq_len), dtype=int)
for i, row in enumerate(tweet_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]

print(features)
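
To make the left-padding concrete, here is a toy example with made-up word IDs (both the IDs and the sequence length are hypothetical, not values from the dataset):

import numpy as np

toy_seq_len = 6
toy_row = [12, 4, 7]                       #word IDs for one short, cleaned tweet
padded = np.zeros(toy_seq_len, dtype=int)
padded[-len(toy_row):] = np.array(toy_row)[:toy_seq_len]
print(padded)                              #[ 0  0  0 12  4  7]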


Summary

Creating a dataset from Tweet data

1. Loading the Tweet data

2. Morphological analysis of the Tweet data

3. Creating a database of words

4. Creating the features (database)

import nltk
import numpy as np
import pandas as pd
import re
from collections import Counter
from nltk.corpus import stopwords

#Load Tweet data
Tweet = pd.read_csv('./6020_negative_positive_data/data/Airline-Sentiment-2-w-AA.csv', encoding='cp932')
tweetData = Tweet.loc[:,['text','airline_sentiment']]

#Performs morphological analysis of the English tweets
def tweet_to_words(raw_tweet):

    #Keep only the letters a-z/A-Z and '@', then lowercase and split into a list of words
    letters_only = re.sub("[^a-zA-Z@]", " ", raw_tweet)
    words = letters_only.lower().split()

    #Remove stop words and words starting with '@' or 'flight'
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops and not re.match("^[@]", w) and not re.match("flight", w)]
    return " ".join(meaningful_words)

cleanTweet = tweetData['text'].apply(lambda x: tweet_to_words(x))

#Create a database
all_text = ' '.join(cleanTweet)
words = all_text.split()

#Count the number of times a word appears
counts = Counter(words)

#Sort the words in descending order of frequency
vocab = sorted(counts, key=counts.get, reverse=True)
#Assign IDs starting from 1 (0 is reserved for padding later)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

#Stores the digitized string in a new list
tweet_ints = []
for each in cleanTweet:
    tweet_ints.append([vocab_to_int[word] for word in each.split()])

#Find out the number of words in Tweet
tweet_len = Counter([len(x) for x in tweet_ints])
seq_len = max(tweet_len)
print("Zero-length reviews: {}".format(tweet_len[0]))
print("Maximum review length: {}".format(max(tweet_len)))

#Remove from each list the rows whose word count became 0 after the cleanTweet processing
#(tweet_idx can be used to filter the labels and tweetData rows in the same way as shown earlier)
tweet_idx  = [idx for idx,tweet in enumerate(tweet_ints) if len(tweet) > 0]
tweet_ints = [tweet for tweet in tweet_ints if len(tweet) > 0]

#Create a zero-filled array and write each row's word IDs from the right so that every row has the same number of columns
features = np.zeros((len(tweet_ints), seq_len), dtype=int)
for i, row in enumerate(tweet_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]
print(features)
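
As a preview of how this dataset could be fed to an RNN, here is a minimal sketch. It assumes TensorFlow/Keras is installed and reuses vocab_to_int, seq_len, features, and the labels array (filtered with tweet_idx as shown earlier); the layer sizes and training settings are illustrative only, not necessarily the configuration used in Part 2.

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

vocab_size = len(vocab_to_int) + 1             #+1 because word IDs start at 1 and 0 is used for padding
model = Sequential([
    Embedding(vocab_size, 32),                 #word IDs -> dense vectors
    SimpleRNN(32),                             #reads each tweet from left to right
    Dense(3, activation='softmax')             #negative / positive / neutral
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
#model.fit(features, labels, epochs=3, batch_size=64, validation_split=0.2)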
