OpenMS 2.6.0
 
 
  
  
 
Go to the documentation of this file.
   51 #include <boost/regex.hpp> 
   81 template<
typename TBackend>
 
  106     f_.readStart(FASTA_file);
 
  112     return chunk_offset_;
 
  123     chunk_offset_ += data_fg_.size();
 
  124     data_fg_.swap(data_bg_);
 
  126     return !data_fg_.empty();
 
  138     data_bg_.reserve(suggested_size);
 
  140     for (
int i = 0; i < suggested_size; ++i)
 
  142       std::streampos spos = f_.position();
 
  143       if (!f_.readNext(p)) 
break;
 
  144       data_bg_.push_back(std::move(p));
 
  145       offsets_.push_back(spos);
 
  147     return !data_bg_.empty();
 
  153     return data_fg_.size();
 
  165     return data_fg_[pos];
 
  184     if (chunk_offset_ <= pos && pos < chunk_offset_ + chunkSize())
 
  186       protein = data_fg_[pos - chunk_offset_];
 
  190     if (pos >= offsets_.size())
 
  194     std::streampos spos = f_.position(); 
 
  195     if (!f_.setPosition(offsets_[pos])) 
return false;
 
  196     bool r = f_.readNext(protein);
 
  197     f_.setPosition(spos); 
 
  204     return f_.atEnd() && offsets_.empty();
 
  225     return offsets_.size();
 
  270     if (!activate_count_)
 
  309     protein = data_[pos];
 
  316     return data_.empty();
 
  333   const std::vector<FASTAFile::FASTAEntry>& 
data_; 
 
  334   int activate_count_ = 0;
 
  335   int cache_count_ = 0;
 
  363     const std::vector<std::string> affixes{ 
"decoy", 
"dec", 
"reverse", 
"rev", 
"reversed", 
"__id_decoy", 
"xxx", 
"shuffled", 
"shuffle", 
"pseudo", 
"random" };
 
  371     const std::string regexstr_prefix = std::string(
"^(") + ListUtils::concatenate<std::string>(affixes, 
"_*|") + 
"_*)";
 
  372     const std::string regexstr_suffix = std::string(
"(") + ListUtils::concatenate<std::string>(affixes, 
"_*|") + 
"_*)$";
 
  375     const boost::regex pattern_prefix(regexstr_prefix);
 
  376     const boost::regex pattern_suffix(regexstr_suffix);
 
  378     int all_prefix_occur(0), all_suffix_occur(0), all_proteins_count(0);
 
  380     constexpr 
size_t PROTEIN_CACHE_SIZE = 4e5;
 
  384       proteins.cacheChunk(PROTEIN_CACHE_SIZE);
 
  385       if (!proteins.activateCache()) 
break;
 
  387       auto prot_count = (
SignedSize)proteins.chunkSize();
 
  388       all_proteins_count += prot_count;
 
  393         String seq = proteins.chunkAt(i).identifier;
 
  399         bool found_prefix = boost::regex_search(seq_lower, sm, pattern_prefix);
 
  402           std::string match = sm[0];
 
  406           decoy_count[match].first++;
 
  410           decoy_case_sensitive[match] = seq_decoy;
 
  414         bool found_suffix = boost::regex_search(seq_lower, sm, pattern_suffix);
 
  417           std::string match = sm[0];
 
  421           decoy_count[match].second++;
 
  425           decoy_case_sensitive[match] = seq_decoy;
 
  431     for (
auto &a : decoy_count) 
OPENMS_LOG_DEBUG << a.first << 
"\t" << a.second.first << 
"\t" << a.second.second << std::endl;
 
  435     if (all_prefix_occur + all_suffix_occur < 0.4 * all_proteins_count)
 
  437       OPENMS_LOG_ERROR << 
"Unable to determine decoy string (not enough occurrences; <40%)!" << std::endl;
 
  438       return {
false, 
"?", 
true};
 
  441     if (all_prefix_occur == all_suffix_occur)
 
  443       OPENMS_LOG_ERROR << 
"Unable to determine decoy string (prefix and suffix occur equally often)!" << std::endl;
 
  444       return {
false, 
"?", 
true};
 
  448     for (
const auto& pair : decoy_count)
 
  450       const std::string & case_insensitive_decoy_string = pair.first;
 
  451       const std::pair<int, int>& prefix_suffix_counts = pair.second;
 
  452       double freq_prefix = static_cast<double>(prefix_suffix_counts.first) / static_cast<double>(all_prefix_occur);
 
  453       double freq_prefix_in_proteins = static_cast<double>(prefix_suffix_counts.first) / static_cast<double>(all_proteins_count);
 
  455       if (freq_prefix >= 0.8 && freq_prefix_in_proteins >= 0.4)
 
  457         if (prefix_suffix_counts.first != all_prefix_occur)
 
  459           OPENMS_LOG_WARN << 
"More than one decoy prefix observed!" << std::endl;
 
  460           OPENMS_LOG_WARN << 
"Using most frequent decoy prefix (" << (
int)(freq_prefix * 100) << 
"%)" << std::endl;
 
  463         return { 
true, decoy_case_sensitive[case_insensitive_decoy_string], 
true};
 
  468     for (
const auto& pair : decoy_count)
 
  470       const std::string& case_insensitive_decoy_string = pair.first;
 
  471       const std::pair<int, int>& prefix_suffix_counts = pair.second;
 
  472       double freq_suffix = static_cast<double>(prefix_suffix_counts.second) / static_cast<double>(all_suffix_occur);
 
  473       double freq_suffix_in_proteins = static_cast<double>(prefix_suffix_counts.second) / static_cast<double>(all_proteins_count);
 
  475       if (freq_suffix >= 0.8 && freq_suffix_in_proteins >= 0.4)
 
  477         if (prefix_suffix_counts.second != all_suffix_occur)
 
  479           OPENMS_LOG_WARN << 
"More than one decoy suffix observed!" << std::endl;
 
  480           OPENMS_LOG_WARN << 
"Using most frequent decoy suffix (" << (
int)(freq_suffix * 100) << 
"%)" << std::endl;
 
  483         return { 
true, decoy_case_sensitive[case_insensitive_decoy_string], 
false};
 
  487     OPENMS_LOG_ERROR << 
"Unable to determine decoy string and its position. Please provide a decoy string and its position as parameters." << std::endl;
 
  488     return {
false, 
"?", 
true};
 
  
bool cacheChunk(int)
no-op (since data is already fully available as vector)
Definition: FASTAContainer.h:281
size_t chunkSize() const
active data spans the full range, i.e. size of container
Definition: FASTAContainer.h:295
const FASTAFile::FASTAEntry & chunkAt(size_t pos) const
Retrieve a FASTA entry at cache position pos (fast)
Definition: FASTAContainer.h:163
This class serves for reading in and writing FASTA files.
Definition: FASTAFile.h:64
A more convenient string class.
Definition: String.h:59
const std::vector< FASTAFile::FASTAEntry > & data_
reference to existing data
Definition: FASTAContainer.h:333
size_t size() const
calls size() on underlying vector
Definition: FASTAContainer.h:320
const FASTAFile::FASTAEntry & chunkAt(size_t pos) const
fast access to chunked (i.e. all) entries
Definition: FASTAContainer.h:301
std::vector< FASTAFile::FASTAEntry > data_fg_
active (foreground) data
Definition: FASTAContainer.h:231
#define OPENMS_LOG_WARN
Macro for logging a warning, i.e. a piece of information which should be read by the user.
Definition: LogStream.h:460
FASTA entry type (identifier, description and sequence)
Definition: FASTAFile.h:76
String name
Definition: FASTAContainer.h:347
static String prefix(const String &this_s, size_t length)
Definition: StringUtils.h:381
FASTAContainer(const String &FASTA_file)
C'tor with FASTA filename.
Definition: FASTAContainer.h:99
size_t chunkSize() const
number of entries in active cache
Definition: FASTAContainer.h:151
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
std::map< std::string, std::pair< int, int > > DecoyStringToAffixCount
Definition: FASTAContainer.h:492
bool empty() const
is the FASTA file empty?
Definition: FASTAContainer.h:202
static String suffix(const String &this_s, size_t length)
Definition: StringUtils.h:390
bool is_prefix
Definition: FASTAContainer.h:348
std::vector< std::streampos > offsets_
internal byte offsets into FASTA file for random access reading of previous entries.
Definition: FASTAContainer.h:230
size_t getChunkOffset() const
always 0, since this specialization requires/supports no chunking
Definition: FASTAContainer.h:259
#define OPENMS_LOG_DEBUG
Macro for general debugging information.
Definition: LogStream.h:470
FASTAFile f_
FASTA file connection.
Definition: FASTAContainer.h:229
bool cacheChunk(int suggested_size)
Prefetch a new cache in the background, with up to suggested_size entries (or fewer upon reaching EOF)
Definition: FASTAContainer.h:135
void reset()
resets reading of the FASTA file, enables fresh reading of the FASTA from the beginning
Definition: FASTAContainer.h:208
bool readAt(FASTAFile::FASTAEntry &protein, size_t pos)
Retrieve a FASTA entry at global position pos (must not be behind the currently active chunk,...
Definition: FASTAContainer.h:181
bool readAt(FASTAFile::FASTAEntry &protein, size_t pos) const
fast access to an entry
Definition: FASTAContainer.h:307
size_t size() const
NOT the number of entries in the FASTA file, but merely the number of already read entries (since we ...
Definition: FASTAContainer.h:223
bool empty() const
calls empty() on underlying vector
Definition: FASTAContainer.h:314
Int overflow exception.
Definition: Exception.h:254
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:134
std::vector< FASTAFile::FASTAEntry > data_bg_
prefetched (background) data; will become the next active data
Definition: FASTAContainer.h:232
#define OPENMS_LOG_ERROR
Macro to be used if non-fatal errors are reported (processing continues)
Definition: LogStream.h:455
size_t getChunkOffset() const
how many entries were read and got swapped out already
Definition: FASTAContainer.h:110
bool success
Definition: FASTAContainer.h:346
bool activateCache()
no-op (since data is already fully available as vector)
Definition: FASTAContainer.h:268
void reset()
required for template parameters!
Definition: FASTAContainer.h:326
bool activateCache()
Swaps in the background cache of entries, read previously via cacheChunk()
Definition: FASTAContainer.h:121
size_t chunk_offset_
number of entries before the current chunk
Definition: FASTAContainer.h:233
Definition: FASTAContainer.h:344
static Result findDecoyString(FASTAContainer< T > &proteins)
Heuristic to determine the decoy string given a set of protein names.
Definition: FASTAContainer.h:359
template parameter for vector-based FASTA access
Definition: FASTAContainer.h:82
FASTAContainer(const std::vector< FASTAFile::FASTAEntry > &data)
C'tor for already existing data (by reference).
Definition: FASTAContainer.h:253
std::map< std::string, std::string > CaseInsensitiveToCaseSensitiveDecoy
Definition: FASTAContainer.h:493
Helper class for calculations on decoy proteins.
Definition: FASTAContainer.h:341
String & toLower()
Converts the string to lowercase.