Why is my accuracy score so poor? The database is from a JMU Sports forum, and I loaded it with pymongo. I don't understand why my accuracy is so low. I extracted the posts into a list, then made a new list that keeps only the posts from the 80 most prolific authors.
from collections import Counter

from pymongo import MongoClient

# NOTE: despite its name, `db` is a collection handle
# (database `db_name`, collection `collection_name`).
db = MongoClient().db_name.collection_name

# Read every document of the collection into a list.
users = list(db.find())

print(users[1])
# Output recorded in the original post:
# {'_id': 14069840, 'page': 1, 'post_id': 1465, 'post': 'I laughed too hard. Whoops.', 'username': 'Potomac', 'timestamp': '2017-02'}

# (username, post_count) pairs for the 80 usernames with the most posts.
pr = Counter(review['username'] for review in users).most_common(80)
print(pr[:1])
# Output recorded in the original post:
# [('BleedingPurple', 7619)]
# Keep at most MAX_POSTS_PER_AUTHOR posts for each of the prolific authors.
#
# The pasted snippet referred to `prolific_reviewers`, `reviews` and the key
# 'user_id', none of which exist in this script: the (username, count) pairs
# are bound to `pr`, the documents to `users`, and the sample document shown
# earlier carries a 'username' field. Those names are used here instead.
MAX_POSTS_PER_AUTHOR = 500

# username -> number of posts kept so far for that author
keep_ids = {name: 0 for name, _count in pr}
keep_reviews = []
for review in users:
    uid = review['username']
    # Skip authors outside the prolific set and authors already at the cap.
    if uid in keep_ids and keep_ids[uid] < MAX_POSTS_PER_AUTHOR:
        keep_reviews.append(review)
        keep_ids[uid] += 1
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Labels (author names) and documents (post bodies), index-aligned.
authors = [review['username'] for review in keep_reviews]
text = [review['post'] for review in keep_reviews]

# TF-IDF features over all posts. The original called fit_transform on
# `texts`, which was never defined; the list is bound to `text`.
# NOTE(review): the vectorizer is fitted before the train/test split, so its
# vocabulary and IDF weights are computed from the test posts as well.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(text)
print(vectors.shape)
# Output recorded in the original post:
# (146816, 68140)

X_train, X_test, y_train, y_test = train_test_split(
    vectors, authors, test_size=0.2, random_state=1337
)
print(X_train.shape, X_test.shape)
# Output recorded in the original post:
# (117452, 68140) (29364, 68140)

svm = LinearSVC()
svm.fit(X_train, y_train)
predictions = svm.predict(X_test)

print(accuracy_score(y_test, predictions))
# Output recorded in the original post:
# 0.18
Expert Answer
from collections import Counter

from pymongo import MongoClient

# NOTE: despite its name, `db` is a collection handle
# (database `db_name`, collection `collection_name`).
db = MongoClient().db_name.collection_name

# Read every document of the collection into a list.
users = list(db.find())

print(users[1])
# Output recorded in the original post:
# {'_id': 14069840, 'page': 1, 'post_id': 1465, 'post': 'I laughed too hard. Whoops.', 'username': 'Potomac', 'timestamp': '2017-02'}

# (username, post_count) pairs for the 80 usernames with the most posts.
pr = Counter(review['username'] for review in users).most_common(80)
print(pr[:1])
# Output recorded in the original post:
# [('BleedingPurple', 7619)]
import random
from collections import defaultdict

# For each of the prolific authors in `pr`, shuffle that author's posts and
# relabel the second half with the author's name plus a trailing apostrophe,
# so every author becomes two labels ("name" and "name'").

top_authors = [name for name, _count in pr]

# Group the posts by author in one pass over `users` instead of rescanning
# the whole list once per author. Within each group the posts keep the order
# they have in `users`.
wanted = set(top_authors)
posts_by_author = defaultdict(list)
for post in users:
    if post['username'] in wanted:
        posts_by_author[post['username']].append(post)

allpost1 = []
for name in top_authors:
    posts = posts_by_author[name]
    random.shuffle(posts)
    half = len(posts) // 2
    # NOTE: this rewrites 'username' on the same dict objects that `users`
    # holds, so `users` sees the relabelled names afterwards too.
    for post in posts[half:]:
        post['username'] = name + "'"
    allpost1.extend(posts)

print(len(allpost1))
# Output recorded in the original post:
# 146816

z = allpost1
# Labels and post bodies, index-aligned.
authors = [review['username'] for review in z]
text = [review['post'] for review in z]
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# TF-IDF features over all posts. The original called fit_transform on
# `texts`, which is never defined; the post bodies are bound to `text`.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(text)
print(vectors.shape)
# Output recorded in the original post:
# (146816, 68140)

# Hold out 20% of the posts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    vectors, authors, test_size=0.2, random_state=1337
)
print(X_train.shape, X_test.shape)
# Output recorded in the original post:
# (117452, 68140) (29364, 68140)

svm = LinearSVC()
svm.fit(X_train, y_train)
predictions = svm.predict(X_test)

print(accuracy_score(y_test, predictions))
# Output recorded in the original post:
# 0.18