JavaScript - Parsing through a CSV file to convert it to a JSON format file
I have been given the following CSV file, which was extracted from an Excel spreadsheet. To give some background information that may be of assistance: it discusses AGI numbers (think of them as protein identifiers), the unmodified peptide sequences for those protein identifiers, the modified peptide sequences with the modifications made on the unmodified sequences, the index/indices of those modifications, and the combined spectral count for repeated peptides. The text file is called masp.glycomodreader.txt, and the information is in the following format:
agi,unmd peptide (m) = x,mod peptide (om) = ox,index/indeces of modification,counts,combined spectral count repeated peptides
at1g56070.1,nmsviahvdhgkstltdslvaaagiiaqevagdvr,nomsviahvdhgkstltdslvaaagiiaqevagdvr,2,17
at1g56070.1,lymearpmeeglaeaiddgr,lyomearpomeeglaeaiddgr,"3, 9",1
at1g56070.1,eamtplsefedkl,eaomtplsefedkl,3,7
at1g56070.1,lymearpmeeglaeaiddgr,lyomearpomeeglaeaiddgr,"3, 9",2
at1g56070.1,egplaeenmr,egplaeenomr,9,2
at1g56070.1,dlqddfmggaeiik,dlqddfomggaeiik,7,1
The output file needs to contain the result, after extracting the above, in the following format:
at1g56070.1,{"peptides": [{"sequence": "nmsviahvdhgkstltdslvaaagiiaqevagdvr", "mod_sequence": "nomsviahvdhgkstltdslvaaagiiaqevagdvr" , "mod_indeces": 2, "spectral_count": 17}, {"sequence": "lymearpmeeglaeaiddgr" , "mod_sequence": "lyomearpomeeglaeaiddgr", "mod_indeces": [3, 9], "spectral_count": 3}, {"sequence": "eamtplsefedkl" , "mod_sequence": "eaomtplsefedkl", "mod_indeces": [3,9], "spectral_count": 7}, {"sequence": "egplaeenmr", "mod_sequence": "egplaeenomr", "mod_indeces": 9, "spectral_count": 2}, {"sequence": "dlqddfmggaeiik", "mod_sequence": "dlqddfomggaeiik", "mod_indeces": [7], "spectral_count": 1}]}
I have provided my solution below. If anyone has a better solution in another language, or can possibly analyze mine and let me know if there are more efficient methods of going about this, please comment below. Thank you.
#!/usr/bin/env node var fs = require('fs'); var csv = require('csv'); var data ="proteins.csv"; /* uses csv nodejs module parse proteins.csv file. * parses csv file row row , updates peptide_arr. * new entries creates peptide object, similar entries updates * counts in peptide object same agi#. * uses peptide object store protein id agi#, , associated data. * writes formatted peptide objects txt file - output.txt. */ // tracks current row var x = 0; // array of peptide objects stores information csv file var peptide_arr = []; // csv module reads row row data csv() .from(data) .to('debug.csv') .transform(function(row, index) { // first entry push new peptide object agi# (row[0]) if(x == 0) { // cur current peptide read row csv module peptide cur = new peptide( row[0] ); // add assoicated data row (1-5) cur cur.data.peptides.push({ "sequence" : row[1]; "mod_sequence" : row[2]; if(row[5]){ "mod_indeces" : "[" + row[3] + ", " + row[4] + "]"; "spectral_count" : row[5]; } else { "mod_indeces" : row[3]; "spectral_count" : row[4]; } }); // add current peptide array peptide_arr.push(cur); } // move next row x++; }); // loop through peptide_arr , append output each peptide's agi# , data string output = ""; for(var peptide in peptide_arr) { output = output + peptide.tostring() } // write output output.txt fs.writefile("output.txt", output); /* peptide object : * - id:agi# * - data: json array associated */ function peptide(id) // actual function id retrieving , data // storage { this.id = id; this.data = { peptides: [] }; } /* peptide methods : * - tojson : returns formatted string */ peptide.prototype = { tostring: function(){ return this.id + "," + json.stringify(this.data, null, " ") + "/n" } };
Edited note: It seems that when I run the solution I posted, I am getting a memory leak error; it runs infinitely while not producing any substantial, readable output. If anyone is willing to assist in assessing why this is occurring, that would be great.
Does your version work? It looks like you only ever create one peptide object. Also, what is the "if(row[5])" statement doing? In your example data there are always 5 elements. Also, mod_indeces is supposed to be a list, correct? Because in your example output file mod_indeces isn't a list in the first peptide. Anyway, here is what I came up with in Python:
import csv
import json


def parse_peptide_rows(rows):
    """Group peptide rows by protein name.

    Each row must provide, in order: protein name (AGI number), unmodified
    sequence, modified sequence, modification indices (e.g. "2" or "3, 9")
    and spectral count. No header row is expected.

    Returns a dict mapping each protein name to {'peptides': [...]}, with
    proteins and peptides kept in the order they were first read.
    Raises ValueError if an index or count is not an integer.
    """
    data = {}
    for row in rows:
        name = row[0]
        sequence = row[1]
        mod_sequence = row[2]
        # Build a real list: json.dumps cannot serialise a lazy map object.
        mod_indeces = [int(i) for i in row[3].split(',') if i.strip()]
        spectral_count = int(row[4])
        peptide = {'sequence': sequence, 'mod_sequence': mod_sequence,
                   'mod_indeces': mod_indeces, 'spectral_count': spectral_count}
        data.setdefault(name, {'peptides': []})['peptides'].append(peptide)
    return data


def main():
    """Convert proteins.csv into output.txt, one "<name>,<json>" line per protein."""
    # newline='' is what the csv module asks for when reading files (Python 3).
    with open('proteins.csv', newline='') as f:
        data = parse_peptide_rows(csv.reader(f))
    # newline='\n' writes '\n' untranslated on every platform.
    with open('output.txt', 'w', newline='\n') as f:
        for protein, entry in data.items():
            f.write(protein)
            f.write(',')
            f.write(json.dumps(entry))
            f.write('\n')


if __name__ == '__main__':
    main()
If you are on Windows and want to view the file as plain text, you may want to replace '\n' with '\r\n' or os.linesep.
If you want to skip some rows (if there is a header or something), you can do this:
import csv
import json

# Number of leading rows (e.g. a header) to ignore.
rows_to_skip = 1


def parse_peptide_rows(rows, skip=0):
    """Group peptide rows by protein name, ignoring the first `skip` rows.

    Each remaining row must provide, in order: protein name (AGI number),
    unmodified sequence, modified sequence, modification indices (e.g. "2"
    or "3, 9") and spectral count.

    Returns a dict mapping each protein name to {'peptides': [...]}, with
    proteins and peptides kept in the order they were first read.
    Raises ValueError if an index or count is not an integer.
    """
    data = {}
    for rows_read, row in enumerate(rows):
        if rows_read < skip:
            continue
        name = row[0]
        sequence = row[1]
        mod_sequence = row[2]
        # Build a real list: json.dumps cannot serialise a lazy map object.
        mod_indeces = [int(i) for i in row[3].split(',') if i.strip()]
        spectral_count = int(row[4])
        peptide = {'sequence': sequence, 'mod_sequence': mod_sequence,
                   'mod_indeces': mod_indeces, 'spectral_count': spectral_count}
        data.setdefault(name, {'peptides': []})['peptides'].append(peptide)
    return data


def main():
    """Convert proteins.csv into output.txt, one "<name>,<json>" line per protein."""
    # newline='' is what the csv module asks for when reading files (Python 3).
    with open('proteins.csv', newline='') as f:
        data = parse_peptide_rows(csv.reader(f), rows_to_skip)
    # newline='\n' writes '\n' untranslated on every platform.
    with open('output.txt', 'w', newline='\n') as f:
        for protein, entry in data.items():
            f.write(protein)
            f.write(',')
            f.write(json.dumps(entry))
            f.write('\n')


if __name__ == '__main__':
    main()
If you want the keys of the dictionary to be in a particular order, you can use an OrderedDict instead of the default dict. Just replace the peptide line with the following:
# Requires `from collections import OrderedDict` at the top of the script.
# Keys are emitted in exactly the order listed here.
peptide = OrderedDict([
    ('sequence', sequence),
    ('mod_sequence', mod_sequence),
    ('mod_indeces', mod_indeces),
    ('spectral_count', spectral_count),
])
Now the order is preserved. That is, sequence is followed by mod_sequence, followed by mod_indeces, followed by spectral_count. To change the order, just change the order of the elements in the OrderedDict.
Note that you have to add `from collections import OrderedDict` in order to be able to use OrderedDict.
Comments
Post a Comment