Question & Answer: Why is my accuracy score so poor? The database is from a JMU Sports forum. I put it in pymongo. I don't understand w…

Why is my accuracy score so poor? The database is from a JMU Sports forum. I put it in pymongo. I don't understand why my accuracy is so low. I basically extracted the data and put it in a list, then made a new list that keeps only the top 80 most prolific authors.

from pymongo import MongoClient
# Materialise every document of the collection as a list of dicts.
db = MongoClient().db_name.collection_name
users = list(db.find())
print(users[1])
{'_id': 14069840, 'page': 1, 'post_id': 1465, 'post': 'I laughed too hard. Whoops.', 'username': 'Potomac', 'timestamp': '2017-02'}
from collections import Counter
# (username, post_count) pairs for the 80 authors with the most posts.
author_counts = Counter(post['username'] for post in users)
pr = author_counts.most_common(80)
pr[:1]
[('BleedingPurple', 7619)]

# Upper bound on how many posts are kept for any single author.
MAX_POSTS_PER_AUTHOR = 500

# `pr` holds (username, post_count) pairs from Counter.most_common(80);
# start a per-author tally at zero for each of those usernames.
keep_ids = {username: 0 for username, _ in pr}

keep_reviews = []
for review in users:
    # The documents are keyed by 'username'; they have no 'user_id' field.
    uid = review['username']
    if uid in keep_ids and keep_ids[uid] < MAX_POSTS_PER_AUTHOR:
        keep_reviews.append(review)
        keep_ids[uid] += 1

# Parallel lists: the author label and the text of every kept post.
authors = [review['username'] for review in keep_reviews]
# Named `texts` because that is the name the fit_transform call below reads.
texts = [review['post'] for review in keep_reviews]

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
# Convert every post into a row of TF-IDF features.
# NOTE(review): the vectorizer is fitted on all posts before the train/test
# split below, so held-out text also shapes the vocabulary — confirm intended.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(texts)
print(vectors.shape)
# (recorded output of the print above)
(146816, 68140)
# Hold out 20% of the rows for testing (test_size=0.2) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(vectors, authors, test_size=0.2, random_state=1337)
print(X_train.shape, X_test.shape)
# (recorded output of the print above)
(117452, 68140) (29364, 68140)
# Fit a LinearSVC with default settings on the training rows, then predict
# an author label for every held-out row.
svm = LinearSVC()
svm.fit(X_train, y_train)
predictions = svm.predict(X_test)
from sklearn.metrics import accuracy_score
# Score the predicted labels against the true held-out labels.
print(accuracy_score(y_test, predictions))
0.18

Expert Answer

 

from pymongo import MongoClient
# Load every forum-post document from the collection into a list of dicts.
db = MongoClient().db_name.collection_name
users = list(db.find())
print(users[1])
{'_id': 14069840, 'page': 1, 'post_id': 1465, 'post': 'I laughed too hard. Whoops.', 'username': 'Potomac', 'timestamp': '2017-02'}
from collections import Counter
# (username, post_count) pairs for the 80 authors with the most posts.
pr = Counter(review['username'] for review in users).most_common(80)
pr[:1]
[('BleedingPurple', 7619)]

import random
from collections import defaultdict

# Group the posts by author in a single pass instead of rescanning `users`
# once for each entry of `pr`.
posts_by_author = defaultdict(list)
for post in users:
    posts_by_author[post['username']].append(post)

allpost1 = []
for username, _count in pr:
    posts = posts_by_author[username]
    random.shuffle(posts)
    # Relabel the second half of this author's shuffled posts as the
    # pseudo-author "<username>'", so every author becomes two classes.
    # This mutates the post dicts held in `users` in place, and the shuffle
    # is unseeded, so the split differs from run to run.
    # NOTE(review): both halves are random samples of the same author's
    # writing, so a classifier has little to tell them apart — confirm that
    # this relabeling is intended before judging the accuracy score.
    for post in posts[len(posts) // 2:]:
        post['username'] = username + "'"
    allpost1.extend(posts)

len(allpost1)
146816

# Alias kept from the original listing.
z = allpost1

# Parallel lists: the author label and the text of every post.
authors = [review['username'] for review in z]
# Named `texts` because that is the name the fit_transform call below reads.
texts = [review['post'] for review in z]

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
# Convert every post into a row of TF-IDF features.
# NOTE(review): the vectorizer is fitted on all posts before the train/test
# split below, so held-out text also shapes the vocabulary — confirm intended.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(texts)
print(vectors.shape)
# (recorded output of the print above)
(146816, 68140)
# Hold out 20% of the rows for testing (test_size=0.2) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(vectors, authors, test_size=0.2, random_state=1337)
print(X_train.shape, X_test.shape)
# (recorded output of the print above)
(117452, 68140) (29364, 68140)
# Fit a LinearSVC with default settings on the training rows, then predict
# an author label for every held-out row.
svm = LinearSVC()
svm.fit(X_train, y_train)
predictions = svm.predict(X_test)
from sklearn.metrics import accuracy_score
# Score the predicted labels against the true held-out labels.
print(accuracy_score(y_test, predictions))
0.18

Still stressed from student homework?
Get quality assistance from academic writers!