I have two files. One creates a numpy array in compressed sparse row format
from sklearn.feature_extraction.text import TfidfTransformer
import pdb
def stem_document(document):
translatedict = ""
stemmer = PorterStemmer()
for word in string.punctuation:
translatedict = translatedict + word
doc_stemmed = []
for word in document.split():
lowerstrippedword = ''.join(c for c in word.lower() if c not in translatedict)
try:
stemmed_word = stemmer.stem(lowerstrippedword)
doc_stemmed.append(stemmed_word)
except:
print lowerstrippedword + " could not be stemmed."
return ' '.join(doc_stemmed)
def readFileandStem(filestring):
with open(filestring, 'r') as file:
reader = csv.reader(file)
file_extras = []
vector_data = []
error = False
while (error == False):
try:
next = reader.next()
if len(next) == 3 and next[2] != "":
document = next[2]
stemmed_document = stem_document(document)
vector_data.append(stemmed_document)
file_extra = []
file_extra.append(next[0])
file_extra.append(next[1])
file_extras.append(file_extra)
except:
error = True
return [vector_data, file_extras]
filestring = 'Data.csv'
print "Reading File"
data = readFileandStem(filestring)
documents = data[0]
file_extras = data[1]
print "Vectorizing Data"
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(documents)
tf_idf_transform = TfidfTransformer(use_idf=False).fit(matrix)
tf_idf_matrix = tf_idf_transform.transform(matrix)
with open('matrix/matrix.npy', 'w') as matrix_file:
np.save(matrix_file, tf_idf_matrix)
file_json_map = {}
file_json_map['extras'] = file_extras
with open('matrix/extras.json', 'w') as extras_file:
extras_file.write(json.dumps(file_json_map))
print "finished"
The next file is supposed to load the same file...
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
import json
import pdb
with open('matrix/matrix.npy', 'r') as matrix_file:
matrix = np.load(matrix_file)
hcluster = linkage(matrix, "complete")
However, I get the following error:
File "Cluster.py", line 7, in <module>
matrix = np.load(matrix_file)
File "C:\Users\jarek\Anaconda2\lib\site-packages\numpy\lib\npyio.py", line 406, in load
pickle_kwargs=pickle_kwargs)
File "C:\Users\jarek\Anaconda2\lib\site-packages\numpy\lib\format.py", line 620, in read_array
version = read_magic(fp)
File "C:\Users\jarek\Anaconda2\lib\site-packages\numpy\lib\format.py", line 216, in read_magic
raise ValueError(msg % (MAGIC_PREFIX, magic_str[:-2]))
ValueError: the magic string is not correct; expected '\x93NUMPY', got '\x00\x00I\x1c\x00\x00'
I don't know why the magic string would be incorrect because from what I've looked into, all .npy files are supposed to have the same magic string "\x93NUMPY".
Ideas?
with open(blahblah) as matrix_file. just trynp.load(blahblah)