# 🚀 ESM-2 Sequence Classifier
This is a small sequence classifier that assigns protein sequences to one of three categories: enzymes (class 0), receptor_proteins (class 1), and structural_proteins (class 2). It was trained on synthetic data generated by GPT-4 and builds on facebook/esm2_t6_8M_UR50D, one of the ESM-2 models.

Please note that this model is for experimental and educational purposes only, as it has not undergone extensive testing. Use it with caution.
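For reference, the class indices map to labels as follows. This is a minimal sketch based on the description above; if the mapping was stored in the model config at training time, `model.config.id2label` is the authoritative source.

```python
# Label mapping as described above (assumed; prefer model.config.id2label if it is set)
id2label = {0: "enzymes", 1: "receptor_proteins", 2: "structural_proteins"}
```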
## 🚀 Quick Start

### ✨ Features
- Classifies protein sequences into three categories: enzymes, receptor_proteins, and structural_proteins.
- Trained on synthetic data generated by GPT-4.
- Builds on the facebook/esm2_t6_8M_UR50D model.
### 💻 Usage Examples

#### Basic Usage

```python
import torch
from transformers import AutoTokenizer, EsmForSequenceClassification

# Load the fine-tuned classifier and the matching base-model tokenizer
model = EsmForSequenceClassification.from_pretrained("AmelieSchreiber/esm2_t6_8M_UR50D_sequence_classifier_v1")
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
new_sequences_0 = [
"ACGYLKTPKLADPPVLRGDSSVTKAICKPDPVLEK",
"GVALDECKALDYLPGKPLPMDGKVCQCGSKTPLRP",
"VLPGYTCGELDCKPGKPLPKCGADKTQVATPFLRG",
"TCGALVQYPSCADPPVLRGSDSSVKACKKLDPQDK",
"GALCEECKLCPGADYKPMDGDRLPAAATSKTRPVG",
"PAVDCKKALVYLPKPLPMDGKVCRGSKTPKTRPYG",
"VLGYTCGALDCKPGKPLPKCGADKTQVATPFLRGA",
"CGALVQYPSCADPPVLRGSDSSVKACKKLDPQDKT",
"ALCEECKLCPGADYKPMDGDRLPAAATSKTRPVGK",
"AVDCKKALVYLPKPLPMDGKVCRGSKTPKTRPYGR",
]
new_sequences_1 = [
"VGQRFYGGRQKNRHCELSPLPSACRGSVQGALYTD",
"KDQVLTVPTYACRCCPKMDSKGRVPSTLRVKSARS",
"PLAGVACGRGLDYRCPRKMVPGDLQVTPATQRPYG",
"CGVRLGYPGCADVPLRGRSSFAPRACMKKDPRVTR",
"RKGVAYLYECRKLRCRADYKPRGMDGRRLPKASTT",
"RPTGAVNCKQAKVYRGLPLPMMGKVPRVCRSRRPY",
"RLDGGYTCGQALDCKPGRKPPKMGCADLKSTVATP",
"LGTCRKLVRYPQCADPPVMGRSSFRPKACCRQDPV",
"RVGYAMCSPKLCSCRADYKPPMGDGDRLPKAATSK",
"QPKAVNCRKAMVYRPKPLPMDKGVPVCRSKRPRPY",
]
new_sequences_2 = [
"VGKGFRYGSSQKRYLHCQKSALPPSCRRGKGQGSAT",
"KDPTVMTVGTYSCQCPKQDSRGSVQPTSRVKTSRSK",
"PLVGKACGRSSDYKCPGQMVSGGSKQTPASQRPSYD",
"CGKKLVGYPSSKADVPLQGRSSFSPKACKKDPQMTS",
"RKGVASLYCSSKLSCKAQYSKGMSDGRSPKASSTTS",
"RPKSAASCEQAKSYRSLSLPSMKGKVPSKCSRSKRP",
"RSDVSYTSCSQSKDCKPSKPPKMSGSKDSSTVATPS",
"LSTCSKKVAYPSSKADPPSSGRSSFSMKACKKQDPPV",
"RVGSASSEPKSSCSVQSYSKPSMSGDSSPKASSTSK",
"QPSASNCEKMSSYRPSLPSMSKGVPSSRSKSSPPYQ",
]
new_sequences = new_sequences_0 + new_sequences_1 + new_sequences_2

# Tokenize the full batch; padding aligns sequences of different lengths
inputs = tokenizer(new_sequences, return_tensors="pt", padding=True, truncation=True)

# Run inference without tracking gradients
with torch.no_grad():
    logits = model(**inputs).logits

# Pick the highest-scoring class for each sequence
predicted_class_ids = torch.argmax(logits, dim=-1)
for sequence, predicted_class in zip(new_sequences, predicted_class_ids):
    print(f"Sequence: {sequence}, Predicted class: {predicted_class.item()}")
```
## 📄 License

This project is licensed under the MIT License.