37 UInt annotation_id{std::numeric_limits<UInt>::max()};
39 enum class Type { PSI_MOD, UNIMOD, GENERIC };
43 PEFFModification(
Size pos,
const std::string& acc,
const std::string& n,
const std::string& tag =
"",
UInt aid = std::numeric_limits<UInt>::max())
44 : position(pos), accession(acc), name(n), optional_tag(tag), annotation_id(aid)
46 if (StringUtils::hasPrefix(accession,
"MOD:"))
50 else if (StringUtils::hasPrefix(accession,
"UNIMOD:"))
59 name == rhs.
name && type == rhs.
type &&
72 char variant_aa{
'\0'};
74 UInt annotation_id{std::numeric_limits<UInt>::max()};
78 : position(pos), variant_aa(aa), optional_tag(tag), annotation_id(aid) {}
98 UInt annotation_id{std::numeric_limits<UInt>::max()};
102 : start_position(start), end_position(end), replacement(repl), optional_tag(tag), annotation_id(aid) {}
124 UInt annotation_id{std::numeric_limits<UInt>::max()};
127 PEFFProcessedRegion(
Size start,
Size end,
const std::string& acc,
const std::string& n =
"",
const std::string& tag =
"",
UInt aid = std::numeric_limits<UInt>::max())
128 : start_position(start), end_position(end), accession(acc), name(n), optional_tag(tag), annotation_id(aid) {}
148 UInt annotation_id{std::numeric_limits<UInt>::max()};
151 PEFFDisulfideBond(
const std::string& i1,
const std::string& i2,
const std::string& tag =
"",
UInt aid = std::numeric_limits<UInt>::max())
152 : id1(i1), id2(i2), optional_tag(tag), annotation_id(aid) {}
213 return prefix == rhs.
prefix &&
268 std::vector<std::string>& descriptions,
269 std::vector<AASequence>& sequences,
270 bool include_complex =
false)
const;
300 std::vector<std::string>& descriptions,
301 std::vector<AASequence>& sequences,
303 Size max_length = 40,
304 bool include_reference =
true,
305 bool include_variants =
true,
306 bool include_modifications =
false)
const;
327 std::vector<std::string>& descriptions,
328 std::vector<AASequence>& sequences,
329 const std::vector<std::string>& fixed_mods = {},
330 const std::vector<std::string>& variable_mods = {},
331 Size max_variable_mods_per_peptide = 2,
333 Size max_length = 40,
334 bool include_reference =
true,
335 bool include_peff_variants =
true,
336 bool include_peff_modifications =
true)
const;
353 const std::vector<std::pair<Size, const PEFFModification*>>& peff_mods,
354 const std::string& base_description);
384 std::string version{
"1.0"};
388 bool is_decoy{
false};
402 bool has_annotation_identifiers{
false};
403 bool is_proteoform_db{
false};
411 return version == rhs.
version &&
465 void load(
const std::string& filename,
466 std::vector<PEFFEntry>& entries,
467 std::vector<PEFFDatabaseMetadata>& headers)
const;
478 void store(
const std::string& filename,
479 const std::vector<PEFFEntry>& entries,
495 void store(
const std::string& filename,
496 const std::vector<PEFFEntry>& entries,
497 const std::vector<PEFFDatabaseMetadata>& headers)
const;
551 void writeStart(
const std::string& filename,
const std::vector<PEFFDatabaseMetadata>& headers);
608 std::string
formatHeader_(
const std::vector<PEFFDatabaseMetadata>& headers)
const;
615 bool readEntry_(std::string&
id, std::string& description, std::string& seq);
621 std::streampos fileSize_{0};
Representation of a peptide/protein sequence.
Definition AASequence.h:88
This class serves for reading and writing PEFF (PSI Extended FASTA Format) files.
Definition PEFFFile.h:447
void load(const std::string &filename, std::vector< PEFFEntry > &entries, std::vector< PEFFDatabaseMetadata > &headers) const
Loads a PEFF file and stores entries and headers.
void writeStart(const std::string &filename, const std::vector< PEFFDatabaseMetadata > &headers)
Prepares a PEFF file for streamed writing using writeNext(), with multiple headers.
void writeEnd()
Closes the output file (called automatically in destructor)
std::string formatHeader_(const std::vector< PEFFDatabaseMetadata > &headers) const
Format the header section for output (multiple database blocks)
PEFFFile()=default
Default constructor.
bool atEnd() const
Returns true if the end of the file has been reached.
std::string seq_
Current sequence buffer.
Definition PEFFFile.h:622
void store(const std::string &filename, const std::vector< PEFFEntry > &entries, const PEFFDatabaseMetadata &header) const
Stores entries to a PEFF file with the given header.
~PEFFFile() override=default
Destructor.
std::string formatHeader_(const PEFFDatabaseMetadata &header) const
Format the header section for output.
std::string formatEntry_(const PEFFEntry &entry) const
Format a single entry for output.
static bool isPEFFFile(const std::string &filename)
Checks if a file appears to be a PEFF file (by checking for # PEFF header).
void writeStart(const std::string &filename, const PEFFDatabaseMetadata &header)
Prepares a PEFF file for streamed writing using writeNext().
void parseHeaderLine_(const std::string &line, PEFFDatabaseMetadata &header, bool &new_db)
Parse a header line (# Key=Value or # //)
PEFFModification parseModification_(const std::string &tuple)
Parse a single modification tuple.
PEFFProcessedRegion parseProcessedRegion_(const std::string &tuple)
Parse a processed region tuple.
PEFFVariantComplex parseVariantComplex_(const std::string &tuple)
Parse a complex variant tuple.
bool readNext(PEFFEntry &entry)
Reads the next PEFF entry from the file.
std::ofstream outfile_
Output file stream.
Definition PEFFFile.h:618
void readStart(const std::string &filename)
Prepares a PEFF file for streamed reading using readNext().
void store(const std::string &filename, const std::vector< PEFFEntry > &entries, const std::vector< PEFFDatabaseMetadata > &headers) const
Stores entries to a PEFF file with multiple database headers.
void writeNext(const PEFFEntry &entry)
Writes the next PEFF entry to the file.
std::vector< std::string > parseParenList_(const std::string &value)
Parse a parenthesized list of values.
const std::vector< PEFFDatabaseMetadata > & getHeaders() const
Returns the headers parsed during readStart().
PEFFDisulfideBond parseDisulfideBond_(const std::string &tuple)
Parse a disulfide bond tuple.
bool readEntry_(std::string &id, std::string &description, std::string &seq)
Read entry data (identifier, description, sequence)
std::string id_
Current identifier buffer.
Definition PEFFFile.h:623
std::vector< PEFFDatabaseMetadata > headers_
Parsed headers.
Definition PEFFFile.h:619
std::fstream infile_
Input file stream.
Definition PEFFFile.h:617
std::string description_
Current description buffer.
Definition PEFFFile.h:624
PEFFVariantSimple parseVariantSimple_(const std::string &tuple)
Parse a simple variant tuple.
static std::string toProForma(const PEFFEntry &entry)
Converts a PEFF entry to ProForma notation.
void parseAnnotations_(const std::string &description, PEFFEntry &entry)
Parse annotations from the description line.
Base class for all classes that want to report their progress.
Definition ProgressLogger.h:27
Class for the enzymatic digestion of proteins represented as AASequence or String.
Definition ProteaseDigestion.h:32
int Int
Signed integer type.
Definition Types.h:72
unsigned int UInt
Unsigned integer type.
Definition Types.h:64
size_t Size
Size type, e.g. used for variables that hold the result of size()
Definition Types.h:97
Main OpenMS namespace.
Definition openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
Definition AhoCorasickAmbiguous.h:112
FASTA entry type (identifier, description and sequence). The first std::string corresponds to the iden...
Definition FASTAFile.h:46
Represents a custom key definition from the PEFF header.
Definition PEFFFile.h:361
std::vector< std::string > field_names
Definition PEFFFile.h:366
std::string concept_curie
Definition PEFFFile.h:364
std::string key_name
Definition PEFFFile.h:362
std::vector< std::string > field_types
Definition PEFFFile.h:367
std::string description
Definition PEFFFile.h:363
bool operator==(const PEFFCustomKeyDef &rhs) const
Definition PEFFFile.h:369
std::string regexp
Definition PEFFFile.h:365
Represents a disulfide bond annotation in PEFF.
Definition PEFFFile.h:144
UInt annotation_id
Optional annotation identifier, max() = not set.
Definition PEFFFile.h:148
PEFFDisulfideBond(const std::string &i1, const std::string &i2, const std::string &tag="", UInt aid=std::numeric_limits< UInt >::max())
Definition PEFFFile.h:151
std::string id1
First cysteine reference (AnnotationIdentifier of the cysteine residue)
Definition PEFFFile.h:145
bool operator==(const PEFFDisulfideBond &rhs) const
Definition PEFFFile.h:154
std::string optional_tag
Optional tag (e.g., "between chains")
Definition PEFFFile.h:147
std::string id2
Second cysteine reference (AnnotationIdentifier of the cysteine residue)
Definition PEFFFile.h:146
PEFFDisulfideBond()=default
Represents a single entry in a PEFF file with all annotations.
Definition PEFFFile.h:174
void generatePeptides(const ProteaseDigestion &digestor, std::vector< std::string > &descriptions, std::vector< AASequence > &sequences, const std::vector< std::string > &fixed_mods={}, const std::vector< std::string > &variable_mods={}, Size max_variable_mods_per_peptide=2, Size min_length=6, Size max_length=40, bool include_reference=true, bool include_peff_variants=true, bool include_peff_modifications=true) const
Generate peptides with PEFF annotations and optional sample handling modifications.
bool operator==(const PEFFEntry &rhs) const
Definition PEFFFile.h:211
std::string entry_id
\ID (e.g., NPM_HUMAN)
Definition PEFFFile.h:190
Size sequence_length
\Length
Definition PEFFFile.h:185
PEFFEntry(const PEFFEntry &rhs)=default
std::map< std::string, std::string > custom_annotations
Definition PEFFFile.h:202
std::string entry_version
\EV
Definition PEFFFile.h:187
Int protein_existence
\PE (1-5)
Definition PEFFFile.h:188
AASequence getModifiedSequence() const
Get an AASequence with all annotated modifications applied.
std::vector< PEFFModification > modifications
Definition PEFFFile.h:194
std::string taxonomy_name
\TaxName
Definition PEFFFile.h:184
std::string identifier
Definition PEFFFile.h:177
static std::vector< std::pair< std::string, AASequence > > enumeratePEFFModifications_(const AASequence &peptide, const std::vector< std::pair< Size, const PEFFModification * > > &peff_mods, const std::string &base_description)
Apply PEFF modifications at specific positions to a peptide.
std::vector< PEFFProcessedRegion > processed_regions
Definition PEFFFile.h:197
void getVariantSequences(std::vector< std::string > &descriptions, std::vector< AASequence > &sequences, bool include_complex=false) const
Get all variant sequences (each variant applied individually).
std::string gene_name
\GName
Definition PEFFFile.h:182
PEFFEntry(PEFFEntry &&rhs) noexcept=default
AASequence getSequence() const
Get the base AASequence for this entry (unmodified sequence).
std::vector< std::string > alt_accessions
\AltAC - alternative accessions
Definition PEFFFile.h:191
AASequence getProcessedSequence(const std::string &region_accession="PEFF:0001021") const
Get processed sequence (e.g., mature protein without signal peptide).
std::vector< PEFFVariantSimple > simple_variants
Definition PEFFFile.h:195
std::string prefix
Database prefix from description line (e.g., "sp" from ">sp:P12345")
Definition PEFFFile.h:176
Int ncbi_tax_id
\NcbiTaxId or \OX
Definition PEFFFile.h:183
std::vector< PEFFDisulfideBond > disulfide_bonds
\DisulfideBond
Definition PEFFFile.h:198
std::string sequence
Definition PEFFFile.h:178
void digestWithVariants(const ProteaseDigestion &digestor, std::vector< std::string > &descriptions, std::vector< AASequence > &sequences, Size min_length=6, Size max_length=40, bool include_reference=true, bool include_variants=true, bool include_modifications=false) const
Generate all variant and/or modification peptides by digesting with a given protease.
PEFFEntry & operator=(PEFFEntry &&rhs) noexcept=default
std::vector< std::string > protein_names
\PName - may have multiple names
Definition PEFFFile.h:181
static PEFFEntry fromFASTAEntry(const FASTAFile::FASTAEntry &fasta)
Create a PEFFEntry from a FASTAEntry (basic fields only)
std::vector< PEFFVariantComplex > complex_variants
Definition PEFFFile.h:196
std::vector< std::string > proteoforms
ProForma notation.
Definition PEFFFile.h:199
FASTAFile::FASTAEntry toFASTAEntry() const
Convert to a FASTAFile::FASTAEntry (loses PEFF-specific annotations)
PEFFEntry & operator=(const PEFFEntry &rhs)=default
std::string sequence_version
\SV
Definition PEFFFile.h:186
std::string db_unique_id
\DbUniqueId
Definition PEFFFile.h:189
Represents a PEFF modification annotation.
Definition PEFFFile.h:32
bool operator==(const PEFFModification &rhs) const
Definition PEFFFile.h:56
PEFFModification()=default
UInt annotation_id
Optional annotation identifier (when HasAnnotationIdentifiers=true), max() = not set.
Definition PEFFFile.h:37
Type
Definition PEFFFile.h:39
std::string optional_tag
Optional tag (last component of annotation tuple)
Definition PEFFFile.h:36
std::string name
Human-readable name.
Definition PEFFFile.h:35
Type type
Definition PEFFFile.h:40
PEFFModification(Size pos, const std::string &acc, const std::string &n, const std::string &tag="", UInt aid=std::numeric_limits< UInt >::max())
Definition PEFFFile.h:43
Size position
1-based position, 0 = unknown position (?)
Definition PEFFFile.h:33
std::string accession
"MOD:00046", "UNIMOD:35", or custom
Definition PEFFFile.h:34
Represents a PEFF processed region (signal peptide, transit peptide, etc.).
Definition PEFFFile.h:118
UInt annotation_id
Optional annotation identifier, max() = not set.
Definition PEFFFile.h:124
std::string optional_tag
Optional tag (last component of annotation tuple)
Definition PEFFFile.h:123
PEFFProcessedRegion()=default
PEFFProcessedRegion(Size start, Size end, const std::string &acc, const std::string &n="", const std::string &tag="", UInt aid=std::numeric_limits< UInt >::max())
Definition PEFFFile.h:127
std::string name
Optional name (e.g., "signal peptide")
Definition PEFFFile.h:122
Size end_position
1-based end position
Definition PEFFFile.h:120
bool operator==(const PEFFProcessedRegion &rhs) const
Definition PEFFFile.h:130
Size start_position
1-based start position
Definition PEFFFile.h:119
std::string accession
PEFF CV accession (e.g., "PEFF:0001021")
Definition PEFFFile.h:121
Represents a complex PEFF variant (insertion, deletion, or substitution of multiple amino acids).
Definition PEFFFile.h:93
bool operator==(const PEFFVariantComplex &rhs) const
Definition PEFFFile.h:104
UInt annotation_id
Optional annotation identifier, max() = not set.
Definition PEFFFile.h:98
std::string optional_tag
Optional tag (last component of annotation tuple)
Definition PEFFFile.h:97
PEFFVariantComplex(Size start, Size end, const std::string &repl, const std::string &tag="", UInt aid=std::numeric_limits< UInt >::max())
Definition PEFFFile.h:101
Size end_position
1-based end position
Definition PEFFFile.h:95
PEFFVariantComplex()=default
std::string replacement
Replacement sequence (empty = deletion)
Definition PEFFFile.h:96
Size start_position
1-based start position
Definition PEFFFile.h:94
Represents a simple PEFF variant (single amino acid substitution).
Definition PEFFFile.h:70
UInt annotation_id
Optional annotation identifier, max() = not set.
Definition PEFFFile.h:74
std::string optional_tag
Optional tag (last component of annotation tuple)
Definition PEFFFile.h:73
PEFFVariantSimple(Size pos, char aa, const std::string &tag="", UInt aid=std::numeric_limits< UInt >::max())
Definition PEFFFile.h:77
bool operator==(const PEFFVariantSimple &rhs) const
Definition PEFFFile.h:80
PEFFVariantSimple()=default
char variant_aa
Variant amino acid.
Definition PEFFFile.h:72
Size position
1-based position
Definition PEFFFile.h:71