#!/usr/bin/env python
#[TLP:AMBER] LIMITED DISTRIBUTION: WORK IN PROGRESS
"""
ArXtic: ArXtic queries arXiv and filters the output.
@ Author: Moussouni, Yaël (MSc student; yael.moussouni@etu.unistra.fr)
@ Institution: Université de Strasbourg, CNRS, Observatoire astronomique
  de Strasbourg, UMR 7550, F-67000 Strasbourg, France
@ Date: 2025-09-15

Licence:
ArXtic Copyright (C) 2025 Yaël Moussouni (yael.moussouni@etu.unistra.fr)
arxtic.py Copyright (C) 2025 Yaël Moussouni (yael.moussouni@etu.unistra.fr)
This program is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3 of the License, or (at your
option) any later version. This program is distributed in the hope that
it will be useful, but WITHOUT ANY WARRANTY; without even the implied
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details. You should have received a
copy of the GNU General Public License along with this program. If not,
see www.gnu.org/licenses/.
"""
import os
import textwrap as tw

# NOTE: feedparser (third-party) is imported lazily inside the fetch
# functions so that the filtering utilities remain usable without it.

# Configuration comes from the environment; any of these is None when
# the corresponding variable is unset.
FILTERS_DIR = os.environ.get("FILTERS_DIR")
DB_DIR = os.environ.get("DB_DIR")
ARXIV_QUERY_URL = os.environ.get("ARXIV_QUERY_URL")
ARXIV_RSS_URL = os.environ.get("ARXIV_RSS_URL")
ADSABS_QUERY_URL = os.environ.get("ADSABS_QUERY_URL")

# ANSI escape sequences used to colour terminal output.
COLOUR_DEFAULT = "\033[0m"
COLOUR_INPUT = "\033[36m"
COLOUR_OUTPUT = "\033[32m"
COLOUR_INFO = "\033[34m"
COLOUR_WARNING = "\033[93m"
COLOUR_ERROR = "\033[91m"


## General

def wrap(txt, length=80):
    """Wrap *txt* to *length* columns without breaking long words."""
    return "\n".join(tw.wrap(txt, length, break_long_words=False))


def get_entries(rss):
    """Return the list of entries of a parsed feed dictionary."""
    return rss["entries"]


def get_filters():
    """Read every filter file found in FILTERS_DIR.

    A filter file may contain:
      - ``#FIELD="name"`` lines naming the entry fields to search,
      - comment lines (starting with ``#``) and blank lines (ignored),
      - keyword lines (one keyword per line).

    Returns a list of ``{"fields": [...], "values": [...]}`` dicts,
    one per (non-hidden) file.
    """
    filters = []
    names = [f for f in os.listdir(FILTERS_DIR) if not f.startswith(".")]
    for name in names:
        dic = {"fields": [], "values": []}
        # os.path.join works whether or not FILTERS_DIR has a trailing
        # separator (plain concatenation did not).
        with open(os.path.join(FILTERS_DIR, name)) as filter_file:
            for line in filter_file:
                if "#FIELD" in line:
                    field = line.split("=")[1].replace("\"", "").strip()
                    dic["fields"].append(field)
                elif line.startswith("#") or line in [" \n", "\n", ""]:
                    continue  # comment or blank line
                else:
                    dic["values"].append(line.replace("\n", ""))
        filters.append(dic)
    return filters


def _normalise_ids(ids):
    """Return *ids* as a list of bare arXiv ids (internal helper).

    Accepts a single string or any iterable of strings and strips the
    "oai:" and "arXiv.org:" prefixes.  Raises Exception when *ids* is
    neither.  (The original code referenced np.ndarray without
    importing numpy, raising NameError for every non-list input.)
    """
    def clean(i):
        return i.replace("oai:", "").replace("arXiv.org:", "")
    if isinstance(ids, str):
        return [clean(ids)]
    try:
        return [clean(i) for i in ids]
    except TypeError:
        raise Exception(
            "The type of ids ({}) is not recognized".format(type(ids))
        )


## ArXiV Entries

def filter_entries(filters, entries):
    """Keep only the entries matching at least one filter keyword.

    Matching is a case-insensitive substring search of each keyword in
    the stringified entry field.  For each kept entry, the fields and
    keywords that matched are recorded as well.

    Returns the parallel lists (filtered_entries, filtered_fields,
    filtered_keywords).
    """
    filtered_entries = []
    filtered_fields = []
    filtered_keywords = []
    for entry in entries:
        added = False
        for filter_ in filters:
            for field in filter_["fields"]:
                for value in filter_["values"]:
                    # NOTE(review): raises KeyError if the entry lacks
                    # the field -- assumed guaranteed by the feed format.
                    if value.upper() not in str(entry[field]).upper():
                        continue
                    if not added:
                        # First hit: keep the entry, start its records.
                        filtered_entries.append(entry)
                        filtered_fields.append([field])
                        filtered_keywords.append([value])
                        added = True
                    else:
                        # Further hits: extend the records of this entry.
                        if field not in filtered_fields[-1]:
                            filtered_fields[-1].append(field)
                        if value not in filtered_keywords[-1]:
                            filtered_keywords[-1].append(value)
    return filtered_entries, filtered_fields, filtered_keywords


def print_entries(entries, fields=None, keywords=None):
    """Pretty-print entries to the terminal with ANSI colours.

    *fields* and *keywords*, when given, are the parallel lists
    produced by filter_entries() and are shown after each entry.
    Returns 0.
    """
    for i, entry in enumerate(entries):
        header = COLOUR_INFO + entry["id"]
        if "arxiv_announce_type" in entry:
            header += " (" + entry["arxiv_announce_type"] + ")"
        header += " [" + entry["link"] + "]"
        print(header + COLOUR_DEFAULT)
        print(COLOUR_DEFAULT + wrap(entry["title"]) + COLOUR_DEFAULT)
        print(COLOUR_OUTPUT
              + wrap(", ".join([a["name"] for a in entry["authors"]]))
              + COLOUR_DEFAULT)
        # The first line of the summary duplicates the id line; skip it.
        print(COLOUR_INPUT
              + wrap("\n".join(entry["summary"].split("\n")[1:]))
              + COLOUR_DEFAULT)
        if fields is not None:
            print(COLOUR_ERROR + "Filtered field(s): "
                  + ", ".join(fields[i]) + COLOUR_DEFAULT)
        if keywords is not None:
            print(COLOUR_ERROR + "Filtered keyword(s): "
                  + ", ".join(keywords[i]) + COLOUR_DEFAULT)
        print("")
    return 0


# ArXiV IDs

def get_arxiv_ids(entries):
    """Return the raw id of every entry, in order."""
    return [entry["id"] for entry in entries]


def save_arxiv_ids(ids, library="saved"):
    """Append new ids to DB_DIR/<library>.txt, skipping known ones.

    *ids* may be a single string or an iterable of strings; prefixes
    are stripped by _normalise_ids().  Returns 0.
    """
    ids = _normalise_ids(ids)
    path = os.path.join(DB_DIR, library + ".txt")
    # "a+" creates the file when missing; seek(0) lets us read the
    # already-known ids before appending.
    with open(path, "a+") as db_file:
        db_file.seek(0)
        known_ids = [line.replace("\n", "") for line in db_file.readlines()]
        for i in ids:
            if i not in known_ids:
                db_file.write(i + "\n")
    return 0


## ArXiV

def get_arxiv_rss():
    """Fetch and parse the arXiv RSS feed at ARXIV_RSS_URL."""
    import feedparser as fp  # deferred: third-party, network use only
    return fp.parse(ARXIV_RSS_URL)


def today_arxiv():
    """Fetch today's feed, filter it, save the ids and print the result.

    Returns the (entries, fields, keywords) triple of filter_entries().
    """
    filters = get_filters()
    feed = get_arxiv_rss()
    entries = get_entries(feed)
    entries, fields, keywords = filter_entries(filters, entries)
    ids = get_arxiv_ids(entries)
    save_arxiv_ids(ids)
    print_entries(entries, fields, keywords)
    return entries, fields, keywords


def get_arxiv_from_ids(ids):
    """Query the arXiv API for the given id(s) and return the parsed feed."""
    import feedparser as fp  # deferred: third-party, network use only
    ids = _normalise_ids(ids)
    query = ARXIV_QUERY_URL + "id_list=" + ",".join(ids)
    return fp.parse(query)


## ADS-ABS

def get_adsabs_from_ids(ids):
    """Placeholder for an ADS abstract query -- not implemented yet."""
    # TODO: query ADSABS_QUERY_URL with the given ids.
    return None


if __name__ == "__main__":
    # Guarded so that importing this module no longer triggers a
    # network query and database write as an import side effect.
    entries, fields, keywords = today_arxiv()