javascript - Parsing through CSV file to convert to JSON format file -


i given following csv file extracted excel spreadsheet. give background information of assistance, discusses agi numbers (think of protein identifiers), unmodified peptide sequences protein identifiers, , modified peptide sequences modifications made on unmodified sequences, index/indeces of modifications, , combined spectral count repeated peptides. text file called masp.glycomodreader.txt , information in following format below:

agi,unmd peptide (m) = x,mod peptide (om) = ox,index/indeces of modification,counts,combined  spectral count repeated peptides  at1g56070.1,nmsviahvdhgkstltdslvaaagiiaqevagdvr,nomsviahvdhgkstltdslvaaagiiaqevagdvr,2,17 at1g56070.1,lymearpmeeglaeaiddgr,lyomearpomeeglaeaiddgr,"3, 9",1 at1g56070.1,eamtplsefedkl,eaomtplsefedkl,3,7 at1g56070.1,lymearpmeeglaeaiddgr,lyomearpomeeglaeaiddgr,"3, 9",2 at1g56070.1,egplaeenmr,egplaeenomr,9,2 at1g56070.1,dlqddfmggaeiik,dlqddfomggaeiik,7,1 

the output file needs result after extracting above in following format below:

at1g56070.1,{"peptides": [{"sequence": "nmsviahvdhgkstltdslvaaagiiaqevagdvr", "mod_sequence":     "nomsviahvdhgkstltdslvaaagiiaqevagdvr" , "mod_indeces": 2, "spectral_count": 17}, {"sequence":  "lymearpmeeglaeaiddgr" , "mod_sequence": "lyomearpomeeglaeaiddgr", "mod_indeces": [3, 9],  "spectral_count": 3}, {"sequence": "eamtplsefedkl" , "mod_sequence": "eaomtplsefedkl",  "mod_indeces": [3,9], "spectral_count": 7}, {"sequence": "egplaeenmr", "mod_sequence":  "egplaeenomr", "mod_indeces": 9, "spectral_count": 2}, {"sequence": "dlqddfmggaeiik",  "mod_sequence": "dlqddfomggaeiik", "mod_indeces": [7], "spectral_count": 1}]} 

i have provided solution below: if has better solution in language or can possibly analyze mine , let me know if there more efficient methods of coming this, please comment below. thank you.

    #!/usr/bin/env node      var fs = require('fs');     var csv = require('csv');     var data ="proteins.csv";      /* uses csv nodejs module parse proteins.csv file.     * parses csv file row row , updates peptide_arr.     * new entries creates peptide object, similar entries updates     * counts in peptide object same agi#.     * uses peptide object store protein id agi#, , associated data.     * writes formatted peptide objects txt file - output.txt.     */      // tracks current row     var x = 0;     // array of peptide objects stores information csv file     var peptide_arr = [];      // csv module reads row row data      csv()     .from(data)     .to('debug.csv')     .transform(function(row, index) {         // first entry push new peptide object agi# (row[0])          if(x == 0) {         // cur current peptide read row csv module         peptide cur = new peptide( row[0] );          // add assoicated data row (1-5) cur         cur.data.peptides.push({             "sequence" : row[1];             "mod_sequence" : row[2];             if(row[5]){             "mod_indeces" : "[" + row[3] + ", " + row[4] + "]";             "spectral_count" : row[5];               } else {             "mod_indeces" : row[3];             "spectral_count" : row[4];               }         });          // add current peptide array         peptide_arr.push(cur);         }          // move next row         x++;     });      // loop through peptide_arr , append output each peptide's agi# , data     string output = "";     for(var peptide in peptide_arr)      {         output = output + peptide.tostring()     }     // write output output.txt     fs.writefile("output.txt", output);      /* peptide object :      *  - id:agi#      *  - data: json array associated      */     function peptide(id) // actual function id retrieving , data                          // storage {     this.id = id;     this.data = {         peptides: []     }; }  /* peptide methods :  *  - tojson : returns formatted string  */ peptide.prototype = {     tostring: function(){         return this.id + "," + json.stringify(this.data, null, " ") + "/n"     } }; 

edited note: seems when run solution posted, getting memory leak error; infinitely running while not producing substantial, readable output. if willing assist in assessing why occurring, great.

does version work? looks ever create 1 peptide object. also, "if(row[5])" statement doing? in example data there 5 elements. also, mod_indeces supposed list, correct? because in example output file mod_indeces isn't list in first peptide. anyway, here came in python:

import csv import json data = {} open('proteins.csv','rb') f:     reader = csv.reader(f)     row in reader:         name = row[0]         sequence = row[1]         mod_sequence = row[2]         mod_indeces = map(int,row[3].split(', '))         spectral_count = int(row[4])         peptide = {'sequence':sequence,'mod_sequence':mod_sequence,                    'mod_indeces':mod_indeces,'spectral_count':spectral_count}         if name in data:             data[name]['peptides'].append(peptide)         else:             data[name] = {'peptides':[peptide]}     f.close()  f = open('output.txt','wb') protein in data:     f.write(protein)     f.write(',')     f.write(json.dumps(data[protein]))     f.write('\n') f.close() 

if on windows , want view file plain text, may want replace '\n' '\r\n' or os.linesep.

if want skip rows (if there header or something), can this:

import csv import json data = {} rows_to_skip = 1 rows_read = 0 open('proteins.csv','rb') f:     reader = csv.reader(f)     row in reader:         if rows_read >= rows_to_skip:             name = row[0]             sequence = row[1]             mod_sequence = row[2]             mod_indeces = map(int,row[3].split(', '))             spectral_count = int(row[4])             peptide = {'sequence':sequence,'mod_sequence':mod_sequence,                        'mod_indeces':mod_indeces,'spectral_count':spectral_count}             if name in data:                 data[name]['peptides'].append(peptide)             else:                 data[name] = {'peptides':[peptide]}         rows_read += 1     f.close()  f = open('output.txt','wb') protein in data:     f.write(protein)     f.write(',')     f.write(json.dumps(data[protein]))     f.write('\n') f.close() 

if want keys of dictionary in particular order, can use ordereddict instead of default dict. replace peptide line following:

peptide = ordereddict([('sequence',sequence),                        ('mod_sequence',mod_sequence),                        ('mod_indeces',mod_indeces),                        ('spectral_count',spectral_count)]) 

now order preserved. is, sequence followed mod_sequence followed mod_indeces followed spectral_count. change order, change order of elements in ordereddict.

note have add from collections import ordereddict in order able use ordereddict.


Comments

Popular posts from this blog

javascript - DIV "hiding" when changing dropdown value -

Does Firefox offer AppleScript support to get URL of windows? -

android - How to install packaged app on Firefox for mobile? -