-- F.Damberger -- 14.Feb.2005 -- purpose: read BMRB statistics from a file and update -- all ResidueTypes to reflect those statistics. -- The statistical average and standard deviations will -- be read from the file and replace the relevant values -- in the ResidueType definitions of the loaded repository. -- The DateStamp from the Statistics file will be added -- as attribute to each ResidueType which is updated. -- Only ResidueTypes will be replaced where EVERY atom of -- the StandardResidueType has an entry in the statistics file! -- StandardResidueType contains the largest number of atoms -- for a given residue -- ARG+ : for ARG -- LYS+ : for LYS -- ASP : for ASP- -- GLU : for GLU- -- CYS : for CYSS -- HIS+ : for HIS,HIST -- The statistics file should be a copy-paste of the entire www page of the BMRB statistics -- Step-by-Step -- go to www.bmrb.wisc.edu -- click on "NMR Statistics" in navigator on the left side of the main page -- Go down the page to section "static lists" (which are actually updated weekly) -- In the subsection protein to the right of "Restricted Set (diamagnetic only)" click on the link "ASCII Version" -- as of this writing, this procedure loads the page: -- http://www.bmrb.wisc.edu/ftp/pub/bmrb/statistics/chem_shifts/selected/statsel_prot.txt -- copy-paste the entire page into a file and name it "bmrbdia-2016-08-23.stats" -- where 2016-08-23 is replaced by numbers representing YEAR-MO-DAY indicated at the top of the file to the right of -- "Last updated: " - this is not essential, just useful to recognize whats in the file later. -- BMRB now provides a chemical shift statistics file at fixed location which has simple csv format, but this was introduce later -- and is not as easy on the eye as the webpage format, so I stick to this. -- To see the DateStamp, you will have to define the -- Attribute in the Cara-Explorer under -- Attribute Definitions - ResidueType -- Right-Click in the right window and select "new attribute" -- Enter the name "Updated-MM-DD-YYYY" and the type "memo". -- You can leave the description field blank. -- After running the script, you should see the attribute -- if you click on ResidueType in the CARA-Explorer and -- then right-click in the right window and select "Open Object Table". -- FD Feb.14.2005 -- ver. 2 -- FD Aug.23.2016 -- BMRB changed the format of the date stamp from 12-31-2016 to 12/31/2016 so I revised the function FindDateStamp to read the new format version = "2 (Aug. 23, 2016)" -- for the future: --Currently, I compute the ave and dev statistics for the Generic residueType (BB) by averaging across the ResidueTypes manually -- and enter these for each atom of the BB ResidueType. -- In the future this script could be modified to update of default backbone ResidueType (BB) shifts by averaging the corresponding -- statistics across all the ResidueTypes. -- Currently, in CARAs object model there is no way to define different generic ResidueTypes for different molecule classes -- (like protein and RNA) so I will not implement this yet since as things stand I would have to hardwire the averaging -- based on the names of the ResiduesTypes to avoid averaging statistics from different molecule types (such as from protein and RNA). -- It is not very critical since the backbone shifts are based on very large statistics and don't change much anymore. -- FUNCTION FindDateStamp -------------------------------------------------------------------------- function FindDateStamp( StatFile ) DateStampFound = false DateStampSearch = true while DateStampSearch do local Line = StatFile:read() if Line ~= nil then -- not end of file -- Scan the line ----- StartChar,EndChar,Month,Day,Year = string.find( Line, "Last updated: (%d+)%/(%d+)%/(%d+)" ) if StartChar then DateStampFound = true DateStampSearch = false return Month,Day,Year end else -- EOF DateStampSearch = false NotEof = false error( "Date Stamp not found at start of file: e.g. Last updated: 01-07-2005" ) return -- Return nothing, no DateStamp found end --if Line not nil end -- scan of file until EOF or DateStampFound end --function -- FUNCTION FindStatisticsLine ------------------------------------------------------------------------------------------------------------------------------------ function FindStatisticsLine( Statfile ) ScanForStatisticsLine = true StatisticsLineFound = false StartChar = false while ScanForStatisticsLine do local Line = StatFile:read() if Line ~= nil then -- Scan the Line StartChar,EndChar,Res,AtomName,AtomType,NumShifts,AveShift,DevShift = string.find( Line, "(%u+)%s+(%u+%d*)%s+(%u+)%s+(%d+)%s+%-?%d+%.%d+%s+%-?%d+%.%d+%s+(%-?%d+%.%d+)%s+(%-?%d+%.%d+)" ) if StartChar then return false,Res,AtomName,AtomType,AveShift,DevShift end else ScanForStatisticsLine = false -- EOF reached return true -- EOF is true end if StartChar then StatisticsLineFound = true ScanForStatisticsLine = false end end -- scan for statistics line end -- function -- FUNCTION AllLabelsInResidueType ---------------------------------------------------------------------------------------------------------------------- function AllLabelsInResidueType( ResidueType, Stats, NumLines, NeasyNomenclature ) Atoms = ResidueType:getAtoms() AllLabelsFound = true for AtomLabel,Atom in pairs( Atoms ) do AtomName = Atom:getName() AtomType = Atom:getAtomType() if NeasyNomenclature then AtomName = ConvertNeasyToBmrbLabel( ResidueType, Atom ) end MissingLabel = true if Atom:getValue() then for i = 1,NumLines do -- scan through all Stats AtomMatchFound = false if ThreeLetterName( ResidueType:getShort() ) == Stats.ResName[ i ] and AtomType == Stats.AtomType[ i ] and AtomName == Stats.AtomName[ i ] then AtomMatchFound = true end if AtomMatchFound then MissingLabel = false end end -- scan of Stats else MissingLabel = false -- ignore Atoms with no shifts defined in Repository e.g. Oxygen, Sulfur... end -- if Atom has Shift defined if MissingLabel == true then print("No match for "..ResidueType:getShort()..":"..AtomName) AllLabelsFound = false end end -- scan of Atoms in ResidueType return AllLabelsFound end --Function -- FUNCTION ConvertNeasyToBmrbLabel ---------------------------------------------------------------------------------------------------------------------- -- This function converts the deviating labels in Neasy nomenclature to Bmrb nomenclature -- e.g. BMRB: H Neasy: HN (Amide protons are named "HN" in Neasy, and "H" in Bmrb (IUPAC) nomenclature) -- BMRB: HG Neasy: QG (pseudoatoms have name "Q" in Neasy, the "Q" is replaced with "H" in Bmrb nomenclature -- BMRB: GLY HA3 Neasy: GLY HA1 (GLY alphas are named HA1 and HA2 in Neasy, and HA3 and HA2 in Bmrb (IUPAC) nomenclature) function ConvertNeasyToBmrbLabel( ResidueType, Atom ) AtomName = Atom:getName() AtomType = Atom:getAtomType() if ResidueType:getShort() == "GLY" and AtomType == "H" and AtomName == "HA1" then -- BMRB name HA3 AtomName = "HA3" -- rename it for internal purposes to the expected name in the BMRB database elseif AtomType == "H" and AtomName == "HN" then -- BMRB name H AtomName = "H" -- rename it for internal purposes to the expected name in the BMRB database elseif AtomType == "H" and IsPseudoAtom( AtomName ) then -- BMRB replaces Q with H AtomName = BmrbPseudoAtom( AtomName ) end return AtomName end -- function -- FUNCTION IsPseudoAtom ---------------------------------------------------------------------------------------------------------------------- -- This function checks whether the AtomName contains "Q" and returns true if it does function IsPseudoAtom( AtomName ) -- checks for "Q" in the name StartChar,EndChar,FirstChar,RestChars = string.find( AtomName, "(%u)(%w+)" ) if FirstChar == "Q" then return true else return false end end -- function -- FUNCTION BmrbPseudoAtom ---------------------------------------------------------------------------------------------------------------------- -- This function converts a NEASY pseudoatom to BMRB nomenclature by replacing Q with H function BmrbPseudoAtom( AtomName ) StartChar,EndChar,FirstChar,RestChars = string.find( AtomName, "(%u)(%w+)" ) return "H"..RestChars end -- FUNCTION DateStampResidueType ---------------------------------------------------------------------------------------------------------------- -- This function sets a new attribute for ResidueType "Updated-MM-DD-YYY" -- to be equal to the DateStamp read from header of BMRB stats file -- This documents which ppm values were used to update from BMRB -- It also sets an attribute documenting what the SD values were -- multiplied by (SDmult) to obtain the dev values (usually 4.0) function DateStampResidueType( ResidueType, Month, Day, Year, SDmult ) ResidueType:setAttr( "Updated-MM-DD-YYYY",Month.."-"..Day.."-"..Year) ResidueType:setAttr("SDmult", SDmult ) end -- FUNCTION RemoveAtomShifts ------------------------------------------------------------------------------------------------------------------ -- This function removes atom shifts for atoms which should not have shifts -- like Oxygen atoms. This corrects an error in the template Start1.1.cara function RemoveAtomShifts( AtomType ) ResidueTypes = cara:getResidueTypes() for ResidueTypeId,ResidueType in pairs( ResidueTypes ) do Atoms = ResidueType:getAtoms() for AtomID,Atom in pairs( Atoms ) do if Atom:getAtomType() == AtomType and Atom:getValue() then print( "Removing Atom Shift:"..ResidueType:getShort()..":"..Atom:getName() ) ResidueType:setValue( Atom ) end end end -- for end -- function -- FUNCTION UpdateStatsInResidueType ------------------------------------------------------------------------------------------------------------ -- This function is at the heart of the script. -- This function updates the shifts to the values from the BMRB file function UpdateStatsInResidueType( ResidueType, SDmult, Stats ) print( "\n-------------------- Updating Stats in Residue:"..ResidueType:getShort().."--------------------------" ) Atoms = ResidueType:getAtoms() for AtomLabel,Atom in pairs( Atoms ) do AtomName = ConvertNeasyToBmrbLabel( ResidueType, Atom ) for i=1,NumLines do -- scan through all Stats AtomMatchFound = false if ThreeLetterName( ResidueType:getShort() ) == Stats.ResName[ i ] and Atom:getAtomType() == Stats.AtomType[ i ] and AtomName == Stats.AtomName[ i ] then AtomMatchFound = true end if not Atom:getValue() then --skip it, no shift expected else if AtomMatchFound then StatsAveShift = string.format( "%7.3f", Stats.AveShift[ i ] ) SDtimesDevShift = string.format( "%7.3f", SDmult*Stats.DevShift[ i ] ) AtomShift,AtomDev = Atom:getValue() AtomShift = string.format( "%7.3f", AtomShift ) AtomDev = string.format( "%7.3f", AtomDev ) AtomNameForm = string.format( "%-6s", AtomName ) print( "Updating "..ResidueType:getShort()..":"..AtomNameForm.." from "..AtomShift.." +- "..AtomDev.." to "..StatsAveShift.." +- "..SDtimesDevShift ) -- update stats: NEXT LINE does the actual updating of the repository statistics ResidueType:setValue( Atom, Stats.AveShift[ i ], SDmult*Stats.DevShift[ i ] ) end end --if Shift exists end -- scan of Stats end -- scan of Atoms in ResidueType end -- FUNCTION StandardResidueType------------------------------------------------------------------------------------------------------------ function StandardResidueType( ResidueType, NeasyNomenclature ) -- function points to the "StandardResidueType for each ResidueType -- This is because the BRMB database does not distinguish the different states of residues (like ionization states, tautomers, oxidation) -- Therefore the script replaces the statistics for each ResidueType with those of the StandardResidueType -- e.g. the ResidueTypes CYS and CYSS will have the same shifts (even for the CB spins which vary significantly between CYSS(red) and CYS(ox) ) -- Blame BMRB for this, It's not my decision. -- e.g. CYS for CYSS -- ARG+ for ARG -- LYS+ for LYS -- GLU- for GLU -- ASP- for ASP -- HIS+ for HIS and HIST etc.. IUPACRES = { "ALA", "ARG", "ASN", "ASP", "CYS", "GLU", "PHE", "GLN", "GLY", "HIS", "ILE", "LEU", "LYS", "MET", "PRO", "SER", "THR", "TRP", "TYR", "VAL" } Type = ResidueType:getShort() if NeasyNomenclature then if Type == "CYSS" then Type = "CYS" end if Type == "ASP" then Type = "ASP-" end if Type == "GLU" then Type = "GLU-" end if Type == "HIS" then Type = "HIS+" end if Type == "HIST" then Type = "HIS+" end if Type == "LYS" then Type = "LYS+" end if Type == "ARG" then Type = "ARG+" end else -- BMRB nomenclature expected IUPAC = false for Index,AllowedResName in pairs( IUPACRES ) do if Type == AllowedResName then IUPAC = true end end if not IUPAC then print( "\n-------------------- Non IUPAC residue name: "..Type.."--------------------------" ) --print( "Non IUPAC residue name: "..Type ) end end -- if NeasyNomenclature StandardType = cara:getResidueType( Type ) return StandardType end -- function StandardResidueType function ThreeLetterName( ResidueTypeName ) -- change name to three letter name , BMRB only uses the standard three letter name -- e.g. Neasy and Cara template use ARG+ to mean the protonated arginine, BMRB uses ARG StartChar,EndChar,FirstThreeChars = string.find( ResidueTypeName, "(%u%u%u)" ) return FirstThreeChars end -- Main Body of Program ------------------------------------------------------------------------------------------------------------------------------- --0. Get Format of Repository ResidueTypes (Neasy or Bmrb nomenclature) -- e.g. Amide Protons: Neasy "HN" / BMRB "H" RepositoryFormat = dlg.getSymbol("Select Repository format Bmrb or Neasy","Which Format are the Repository Labels? e.g. if Amide protons are labeled H/HN you have BMRB/Neasy format","BMRB","Neasy") if RepositoryFormat == "Neasy" then NeasyNomenclature = true else NeasyNomenclature = false end --1. Get BMRB Statistics File Location local StatFileLocation = dlg.getOpenFileName( "Select the BRMB statistics file", "*.stats" ) StatFile = io.open( StatFileLocation ) NotEof = true --2. Get DateStamp Month,Day,Year = FindDateStamp( StatFile ) DateStamp = Month.."-"..Day.."-"..Year if Month == nil then error("No Date Stamp found. Read in aborted") end print( "MM-DD-YEAR = "..Month.."-"..Day.."-"..Year ) -- create array for statistics: Stats = {} Stats.ResName = {} Stats.AtomName = {} Stats.AtomType = {} Stats.AveShift = {} Stats.DevShift = {} -- Read in Statistics NumLines = 0 Eof = false while not Eof do Eof,ResName,AtomName,AtomType,AveShift,DevShift = FindStatisticsLine( StatFile ) if ResName then NumLines = NumLines + 1 print(ResName,AtomName,AtomType,AveShift,DevShift) -- READ STATS DATA INTO ARRAY Stats.ResName[ NumLines ] = ResName Stats.AtomName[ NumLines ] = AtomName Stats.AtomType[ NumLines ] = AtomType Stats.AveShift[ NumLines ] = AveShift Stats.DevShift[ NumLines ] = DevShift end end print( "read in "..NumLines.." statistics lines" ) --3. Get the multiplier for the dev value from the user (default is 4.0) SDmult = dlg.getText("Enter multiplier for Standard Deviations","SD multiplier","4.0") StartChar = nil StartChar,EndChar,SDmultParse = string.find( SDmult, "(%d+%.%d+)" ) if StartChar == nil then error("Please enter a floating point number like 4.0") end --4. ACTUAL WORK PART OF SCRIPT : Loop through ResidueTypes and replace the old Ave/Dev with the new. -- remove shifts for O and S -- this fixes an error in Start1.1.cara print( "removing shifts for O and S -- this fixes an error in Start1.1.cara" ) RemoveAtomShifts( "O" ) RemoveAtomShifts( "S" ) -- get ResidueTypes and update their shifts ResidueTypes = cara:getResidueTypes() -- print( "Not Replaced / Replaced" ) for ResidueTypeName,ResidueType in pairs( ResidueTypes ) do if AllLabelsInResidueType( StandardResidueType( ResidueType, NeasyNomenclature ), Stats, NumLines, NeasyNomenclature ) then DateStampResidueType( ResidueType, Month, Day, Year, SDmult ) UpdateStatsInResidueType ( ResidueType, SDmult, Stats ) -- print("------------- / -----"..ResidueType:getShort().."-----" ) else print( "-----"..ResidueType:getShort().."----- / -------------" ) blank= 0 -- dummy line end end -- Report which ResidueTypes were updated: print( "----------------------------" ) print( "DateStamp for BMRB file: "..DateStamp ) print( "Summary of Updated Residues:" ) print( "Res Updated-MM-DD-YYYY" ) for ResidueTypeName,ResidueType in pairs( ResidueTypes ) do if ResidueType:getAttr("Updated-MM-DD-YYYY") then if DateStamp == ResidueType:getAttr("Updated-MM-DD-YYYY") then CurrentStatusString = "up to date" else CurrentStatusString = "NOT UP TO DATE!" end FormattedResTyp = string.format( "%-5s", ResidueType:getShort() ) print( FormattedResTyp.." "..ResidueType:getAttr( "Updated-MM-DD-YYYY").." "..CurrentStatusString ) else print( ResidueType:getShort().." never updated." ) end end print("LoadBmrbStats version "..version.." is all done") -- end of Main Body