from Numeric import * from pylab import * from scipy.linalg import * NONUMB = -999999.999; nGeneAnots=6 #Read file f = open('/home/lom/tmp/doublecheck_gsvd/Yeast.txt') firstLine=f.readline() lines=f.readlines() firstLine = firstLine.strip().split('\t') expNames = array(firstLine[nGeneAnots:], PyObject) nGenes = len(lines) nExps = len(expNames) data=ones((nGenes, nExps), 'f')*NONUMB geneNames=zeros(nGenes, PyObject) i=0; for line in lines: line = line.strip() cols = line.split('\t') geneNames[i] = cols[0]#cols[0:nGeneAnots] for j in range(nGeneAnots, len(cols)): if cols[j]!='': data[i][j-nGeneAnots]=float(cols[j]) i=i+1 f.close() #Count NaNs nNanPerExp=zeros([nExps,1]); for i in range(nExps): nNanPerExp[i] = sum(asarray(sum(asarray(data==NONUMB, 'i'),1)==i, 'i')) # sum(asarray(sum(data==NONUMB,1), 'i')==i) #Create and Display number of missing values bar(arange(nExps)-.5, nNanPerExp); axis([-.5, 3.5, 0, 3000]) xlabel('Number of Arrays') ylabel('Number of Genes') show() #Take the only rows without missing values idx = nonzero(sum(data!=NONUMB, 1)==nExps) fullmatrix = take(data, idx) fullgeneNames=take(geneNames, idx) #Calculate SVD [U, S, Vt] = svd(fullmatrix); fractions = S**2/sum(S**2); entropy = -sum(fractions*log(fractions))/log(nExps) #Plot barchart subplot(1,3,2); bar(range(nExps),fractions) title('(b) Expresion Fraction') axes([.45, .25, .16, .62]) bar(range(3,12), fractions[2:11]) #Plot rasterplot subplot(1,3,1); imshow(Vt, cmap=jet(), interpolation='nearest', vmin=-.3, vmax=.3); title('(a) Arrays') #plot two first eigengenes subplot(1,3,3); plot(range(nExps), Vt[0,:], '-ob',range(nExps), Vt[1,:], '-or' ); axis([0, nExps, -1, 1]) title('(c) Arrays') ylabel('Expression Level') show()