Introduction

Let's make a naive Bayes classifier. We will run it on a collection of tweets and Reddit comments, each labeled with positive, neutral, or negative sentiment.

First, let's load the data into training and test corpora: we'll combine the two datasets, shuffle them, and split the result into training and test sets.

In [19]:
from random import shuffle

def getsamples(lines: list):
    n = len(lines)
    i = 1
    samples = []
    while i < n:
        line = lines[i]
        xy = line.split(',')
        x = xy[0].strip().split(' ')
        if len(xy) == 1: # multiline tweet
            line = lines[i][1:].rstrip() # remove double quote
            i += 1
            if len(line) == 0 and i < n:
                # beginning of the tweet, so don't insert a space
                line += lines[i].split(',')[0].rstrip()
                i += 1
            while i < n and len(lines[i].split(',')) == 1:
                line += ' ' + lines[i].rstrip()
                i += 1
            xy = lines[i].split(',')
            endsize = len(xy[0])
            if endsize > 1: # ignore double quote i.e. endsize == 1
                # remove double quote at the end
                line += ' ' + xy[0][:endsize - 1] 
            x = line.split(' ')
        y = int(xy[1].rstrip())
        samples.append([x, y])
        i += 1
    return samples

def getcorpus():
    corpus = []
    fnames = ['reddit',
              'twitter']
    ext = '.csv'
    for fname in fnames:
        with open(fname + ext, 'r', encoding='latin-1') as f:
            corpus.extend(getsamples(f.readlines()))
    return corpus

corpus = getcorpus()
shuffle(corpus) # shuffle so the test set isn't just twitter
train = corpus[:int(len(corpus) * 0.8)]
test = corpus[int(len(corpus) * 0.8):]
classes = [-1, 0, 1] # negative, neutral, positive
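
As an aside, if the files are standard quoted CSVs with a header row and text,label columns (an assumption about the exact format; the labels are taken to be integer strings as above), Python's csv module can replace the hand-rolled multiline handling; a minimal sketch:

In [ ]:
import csv

# sketch of an alternative loader: csv.reader handles commas and multiline
# quoted fields for us, assuming each file is a standard quoted CSV
def getsamples_csv(fname):
    samples = []
    with open(fname, 'r', encoding='latin-1', newline='') as f:
        reader = csv.reader(f)
        next(reader) # skip the header row
        for row in reader:
            if len(row) < 2 or not row[1].strip():
                continue # skip malformed rows
            samples.append([row[0].strip().split(' '), int(row[1].strip())])
    return samples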

Naive Bayes

Now we can define our naive Bayes classifier class. All we really need is a train function and a test function.

To train, we estimate the probability of each class, P(c), and the probability of each feature (in our case, individual words) given each class, P(w|c), by simple counting (maximum likelihood estimation).

The prior probability of a class, P(c), will be defined as the number of documents (tweets or comments) in class c divided by the total number of documents.

The likelihood of a feature given a class, P(w|c), will be defined as the number of occurrences of w in class-c documents divided by the total number of word occurrences in class-c documents.

For a test sample d, we compute P(c)·P(d|c) for each class, where P(d|c) is the product of the likelihoods of the words in d, and return the argmax class as our prediction.
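
Putting these definitions together (and working with log probabilities, as discussed below), the quantities we compute are

    P(c) = N_c / N_doc
    P(w|c) = count(w, c) / (total word count in class c)
    prediction = argmax over c of [ log P(c) + sum over w in d of log P(w|c) ]

where N_c is the number of training documents in class c, N_doc is the total number of training documents, and count(w, c) is the number of times w occurs in class-c documents.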

Implementation Details

We'll need to import a log function, since we would rather add up log probabilities than multiply lots of near-zero probabilities and risk underflowing to zero.
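
As a quick illustration (this toy cell isn't part of the classifier), multiplying a few hundred small probabilities underflows to zero, while the equivalent sum of log probabilities stays perfectly representable:

In [ ]:
from math import log2

# multiplying 400 probabilities of 1e-5 each underflows to 0.0 ...
print((1e-5) ** 400)
# ... while the equivalent sum of log probabilities does not
print(400 * log2(1e-5))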

We'll also need the Counter class to get frequency counts from our training corpus. Specifically, we're going to use a hyperparameter vocab_size to set our features as the vocab_size most common words in the corpus. These are the most common words in the entire corpus, not just for a class.
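
For example, on a toy corpus, Counter.most_common gives exactly the ranked (word, count) pairs we need:

In [ ]:
from collections import Counter

# toy example of how the vocabulary is chosen: the most common words overall
toy_counts = Counter('the cat sat on the mat and the cat slept'.split())
print(toy_counts.most_common(2)) # [('the', 3), ('cat', 2)]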

Notice the additional smoothing parameter. Since we are using individual words as features, there's a chance that a vocabulary word never shows up in the training documents of some class and gets assigned zero probability for that class. The computer will not like taking the log of zero, and we will not like the model. To avoid this, we simply add the value of smoothing to every word count, which gets rid of zero probabilities. Since we add the same amount to every count in the numerator and denominator, we maintain a valid probability distribution. With a value of 1 this is Laplace (add-one) smoothing; more generally it is add-α smoothing, and on smaller datasets a small value like 0.05 typically works better than 1 because we don't want to assign too much probability to words that never occurred in a class.
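
Concretely, with smoothing value α and vocabulary size |V|, the estimate used below is

    P(w|c) = (count(w, c) + α) / (total word count in class c + α·|V|)

so every vocabulary word keeps a little probability mass in every class. (In the code, |V| is vocab_size + 1, since the <unk> token described in the updates below counts as a vocabulary entry.)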

UPDATES:

Included an unknown token <unk> as part of the vocabulary, so we can assign a probability to out-of-vocabulary (OOV) words during testing. Its probability for a given class is (up to smoothing) the sum of the probabilities of all the OOV words in that class.

Added an option to use a list of stopwords from NLTK. It's disabled by default because, unless we somehow account for the effect of the removed stopwords in a sample, we lose information and thus performance.

In [ ]:
from math import log2
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

class NaiveBayesClassifier:

    UNK = '<unk>'

    def __init__(self, C:list, vocab_size:int=5000, smoothing:float=0.05,
                 ignore_stopwords:bool=False):
        self.C = C
        self.vocab_size = vocab_size
        self.smoothing = smoothing
        self.ignore_stopwords = ignore_stopwords
        # cache the stopword list as a set so membership checks are O(1)
        self.stopwords = set(stopwords.words('english')) if ignore_stopwords else set()

    # ignore stopwords when building vocab
    # returns the vocab list
    def stopword_free_vocab(self, counts:Counter):
        V = []
        common_counts = counts.most_common()
        size, i = 0, 0
        while size < self.vocab_size:
            # skip stopwords and empty strings
            while common_counts[i][0] in self.stopwords or len(common_counts[i][0]) == 0:
                i += 1
            V.append(common_counts[i][0])
            i += 1
            size += 1
        return V

    # see the most likely words in the vocab for a given class
    def common_class_words(self, class_index:int, num_words:int=10):
        vocab_likelihoods = [row[class_index] for row in self.loglikelihood]
        likely_word_indices = sorted(range(len(vocab_likelihoods)),
                                     key=lambda idx: vocab_likelihoods[idx],
                                     reverse=True)[:num_words]
        for idx in likely_word_indices:
            w = self.V[idx] if idx < self.vocab_size else self.UNK
            print(' (%s, %f)' % (w, vocab_likelihoods[idx]))

    def train(self, D:list):
        # get counts from a single document of all samples
        big = []
        for i in range(len(D)):
            big.extend(D[i][0])
        counts = Counter(big)
        
        # build vocabulary
        if self.ignore_stopwords:
            self.V = self.stopword_free_vocab(counts)
        else:
            self.V = [tup[0] for tup in counts.most_common(self.vocab_size)]
        # map each vocab word to its feature index for O(1) lookups
        self.vocab_index = {w: j for j, w in enumerate(self.V)}
        
        # iterate over the classes, compute P(c) and P(w|c)'s for each class
        N_doc = len(D)
        self.logpriors = []
        self.bigdoc = []
        self.loglikelihood = [[] for _ in range(self.vocab_size + 1)] # extra row for UNK
        for i in range(len(self.C)):
            c = self.C[i]
            print('Training %d' % c)

            # build class document and calculate P(c)
            N_c = 0
            self.bigdoc.append([])
            for sample in D:
                if sample[1] == c:
                    N_c += 1
                    self.bigdoc[i].extend(sample[0])
            self.logpriors.append(log2(N_c / N_doc))
            print(' Counted %d samples, log P(c) = %f. Counting word occurrences...' % 
                  (N_c, self.logpriors[i]))
            
            # get word counts from class doc
            counter = Counter(self.bigdoc[i])
            # smoothed occurrences, vocab_size + 1 for UNK
            occurrences = sum(counter.values()) + self.smoothing * (self.vocab_size + 1)

            print(' Counted %d occurrences. Computing log P(w|c)s...' % occurrences)
            # get smoothed counts for every word in V
            for j in range(self.vocab_size):
                w = self.V[j]
                count = counter[w]
                self.loglikelihood[j].append(log2((count + self.smoothing) / occurrences))

            # get the total count of OOV words for UNK
            unk_count = 0
            for w, count in counter.most_common():
                if w in self.vocab_index:
                    continue
                unk_count += count
            self.loglikelihood[self.vocab_size].append(log2((unk_count + self.smoothing) / occurrences))

            # print most common words
            # self.common_class_words(i, 5)

    # get probability of each class, choose argmax class as prediction
    def test(self, testdoc):
        max_prob, max_c = float('-inf'), None
        for i in range(len(self.C)):
            class_prob = self.logpriors[i]
            for word in testdoc:
                # don't give stopwords UNK probability if ignoring
                if self.ignore_stopwords and word in self.stopwords:
                    continue
                # OOV words map to the UNK row at index vocab_size
                index = self.vocab_index.get(word, self.vocab_size)
                class_prob += self.loglikelihood[index][i]
            if class_prob > max_prob:
                max_prob = class_prob
                max_c = self.C[i]
        return max_c

We need to be able to evaluate our predictions, so for now we will just compute the recall of each class with a simple function.

In [21]:
# update per-class counts of correct and incorrect predictions
# (e.g. tn counts negative samples predicted negative, fn counts negative samples
#  predicted otherwise, so tn / (tn + fn) below is the recall of the negative class)
def evaluate(y, y_pred, tp, fp, tn, fn, tneu, fneu):
    if y == -1:
        if y_pred == -1:
            tn += 1
        else:
            fn += 1
    elif y == 0:
        if y_pred == 0:
            tneu += 1
        else:
            fneu += 1
    else:
        if y_pred == 1:
            tp += 1
        else:
            fp += 1
    return (tp, fp, tn, fn, tneu, fneu)

Now we can train and test.

In [22]:
vocab_size = 5000
model = NaiveBayesClassifier(classes, vocab_size)
model.train(train)
print('Testing %d samples' % len(test))
tp, fp, tn, fn, tneu, fneu, count = 0, 0, 0, 0, 0, 0, 0
for sample in test:
    count += 1
    y = sample[1]
    y_pred = model.test(sample[0])
    tp, fp, tn, fn, tneu, fneu = evaluate(y, y_pred, tp, fp, tn, fn, tneu, fneu)
    if count % 5000 == 0:
        print(' %d samples, tpr %f tneur %f tnr %f' % (count, (tp / (tp + fp)),
                                                              (tneu / (tneu + fneu)),
                                                              (tn / (tn + fn))))

print('positive recall %%: %f' % (100 * tp / (tp + fp)))
print('neutral recall %%: %f' % (100 * tneu / (tneu + fneu)))
print('negative recall %%: %f' % (100 * tn / (tn + fn)))
Training -1
 Counted 34804 samples, log P(c) = -2.196412. Counting word occurrences...
 Counted 908207 occurrences. Computing log P(w|c)s...
Training 0
 Counted 54570 samples, log P(c) = -1.547557. Counting word occurrences...
 Counted 737328 occurrences. Computing log P(w|c)s...
Training 1
 Counted 70146 samples, log P(c) = -1.185305. Counting word occurrences...
 Counted 1851252 occurrences. Computing log P(w|c)s...
Testing 39880 samples
 5000 samples, tpr 0.849655 tneur 0.774688 tnr 0.735654
 10000 samples, tpr 0.845842 tneur 0.778969 tnr 0.722880
 15000 samples, tpr 0.847724 tneur 0.776567 tnr 0.729147
 20000 samples, tpr 0.849292 tneur 0.778674 tnr 0.728712
 25000 samples, tpr 0.850263 tneur 0.777230 tnr 0.727173
 30000 samples, tpr 0.850155 tneur 0.777898 tnr 0.725023
 35000 samples, tpr 0.851736 tneur 0.777918 tnr 0.725493
positive recall %: 85.145072
neutral recall %: 77.702203
negative recall %: 72.488934

Possible Improvements

We could have different unknown tokens for nouns, verbs, and other parts of speech but that would require part of speech tagging.

A naive Bayes classifier defined with individual words as features can be viewed as a set of unigram language models, one for each class. We could convert our classifier to use n-grams for any n we want instead of unigrams. If so, we have more options for smoothing:

  1. Use discounting and Katz backoff, where we "back off" from an n-gram to the (n-1)-gram whenever the n-gram has a zero count.
  2. Interpolate probabilities, i.e. compute a weighted sum of the n-gram, (n-1)-gram, etc. probabilities such that the weights sum to 1 (see the sketch after this list).
  3. Use Kneser-Ney smoothing, which bases a word's probability on how likely it is to appear as a continuation in a novel context rather than on its raw count. This favors words that are "friendlier": those that appear after many different words.
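
As a rough sketch of option 2 for a single class, suppose we already had a unigram Counter uni, a bigram Counter bi, and a total unigram count total for that class (hypothetical names), plus an interpolation weight lam:

In [ ]:
from math import log2

# sketch of simple linear interpolation for one class: mix the bigram MLE with
# the unigram MLE using weights that sum to 1 (lam and 1 - lam)
def interp_loglikelihood(w, prev, uni, bi, total, lam=0.7):
    p_uni = uni[w] / total
    p_bi = bi[(prev, w)] / uni[prev] if uni[prev] > 0 else 0.0
    p = lam * p_bi + (1 - lam) * p_uni
    return log2(p) if p > 0 else float('-inf')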

As mentioned earlier, just removing stopwords adversely affects performance. We can try to account for the lost meaning in each sample by transforming the remaining words somehow. For example, to handle the removal of stopwords that negate meaning, like not, nor, or don't, we can apply negation tagging: add a prefix indicating negation to each affected word. The sentence "I don't like pizza" becomes "I NOT_like NOT_pizza." The best way to implement this likely requires part of speech tagging.
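
A crude version of negation tagging, assuming a small hand-picked set of negation words and tagging every following token to the end of the sample (rather than stopping at a clause boundary), might look like this:

In [ ]:
# crude negation tagging: once a negation word appears, prefix the remaining
# tokens with NOT_ (a better version would stop at punctuation or use POS tags)
NEGATIONS = {'not', 'no', 'nor', "don't", 'never'}

def negation_tag(tokens):
    tagged, negating = [], False
    for tok in tokens:
        if tok.lower() in NEGATIONS:
            negating = True # drop the negation word itself
            continue
        tagged.append('NOT_' + tok if negating else tok)
    return tagged

print(negation_tag("I don't like pizza".split())) # ['I', 'NOT_like', 'NOT_pizza']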

We used individual words as features, but we can add any kind of features we want. We could have binary features that merely indicate the presence of a word rather than its count, categorical features like parts of speech, or numerical features like the number of words drawn from some predefined sentiment lexicon.
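
For instance, switching to binary presence/absence features only requires clipping each word to at most one count per document before training, a common "binarized" variant of naive Bayes; a sketch using the classes defined above:

In [ ]:
# binary features: keep each distinct word once per document, so the class
# counts become document frequencies instead of raw occurrence counts
binarized_train = [[list(set(sample[0])), sample[1]] for sample in train]
binary_model = NaiveBayesClassifier(classes, vocab_size)
binary_model.train(binarized_train)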

Computing precision and F1 score along with recall would give a better indication of overall performance.
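
A sketch of that, assuming we collected the (true label, predicted label) pairs into a list called results during testing (a hypothetical name):

In [ ]:
from collections import Counter

# per-class precision, recall, and F1 from a confusion matrix built out of
# (true, predicted) pairs
def precision_recall_f1(results, classes):
    confusion = Counter(results) # keys are (true, predicted) pairs
    for c in classes:
        tp = confusion[(c, c)]
        predicted_c = sum(confusion[(t, c)] for t in classes)
        actual_c = sum(confusion[(c, p)] for p in classes)
        precision = tp / predicted_c if predicted_c else 0.0
        recall = tp / actual_c if actual_c else 0.0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
        print('class %2d: precision %.3f recall %.3f F1 %.3f' % (c, precision, recall, f1))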

Lastly, we could split the combined Twitter and Reddit data into training, tuning, and test sets so we have a chance to tune hyperparameters like vocab_size or smoothing; a sketch follows.
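
That could be as simple as an 80/10/10 split of the shuffled corpus, with the middle slice used as the tuning (dev) set:

In [ ]:
# 80/10/10 split: tune hyperparameters on dev, report final numbers on test
n = len(corpus)
train, dev, test = (corpus[:int(n * 0.8)],
                    corpus[int(n * 0.8):int(n * 0.9)],
                    corpus[int(n * 0.9):])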