2025-10-07: Update

This commit is contained in:
Moussouni, Yaël
2025-10-07 09:54:38 +02:00
parent f5150f51ce
commit 9f7a5ef1fb
4 changed files with 86 additions and 69 deletions

1
.gitignore vendored
View File

@@ -6,3 +6,4 @@ log/
venv/ venv/
.venv/ .venv/
filters/ filters/
*.secret

View File

@@ -33,6 +33,12 @@ VENV_DIR="./venv/"
DB_DIR="./db/" DB_DIR="./db/"
FILTERS_DIR="./cfg/filters/" FILTERS_DIR="./cfg/filters/"
## API Keys
ARXIV_API_KEY=""
ADSABS_API_KEY="$(cat cfg/adsabs.secret)"
## Remote URLs ## Remote URLs
QUERY_URL="https://export.arxiv.org/api/query?" ARXIV_QUERY_URL="https://export.arxiv.org/api/query?"
RSS_URL="https://rss.arxiv.org/atom/astro-ph" ARXIV_RSS_URL="https://rss.arxiv.org/atom/astro-ph"
ADSABS_QUERY_URL="https://api.adsabs.harvard.edu/v1/search/query?q="

View File

@@ -2,3 +2,5 @@
2509.15337v1 2509.15337v1
2509.15720v1 2509.15720v1
2509.16168v1 2509.16168v1
2509.18650v1
2509.19068v1

View File

@@ -36,8 +36,9 @@ import feedparser as fp
FILTERS_DIR = os.environ.get("FILTERS_DIR") FILTERS_DIR = os.environ.get("FILTERS_DIR")
DB_DIR = os.environ.get("DB_DIR") DB_DIR = os.environ.get("DB_DIR")
QUERY_URL = os.environ.get("QUERY_URL") ARXIV_QUERY_URL = os.environ.get("ARXIV_QUERY_URL")
RSS_URL = os.environ.get("RSS_URL") ARXIV_RSS_URL = os.environ.get("ARXIV_RSS_URL")
ADSABS_QUERY_URL = os.environ.get("ADSABS_QUERY_URL")
COLOUR_DEFAULT="\033[0m" COLOUR_DEFAULT="\033[0m"
COLOUR_INPUT="\033[36m" COLOUR_INPUT="\033[36m"
@@ -46,18 +47,60 @@ COLOUR_INFO="\033[34m"
COLOUR_WARNING="\033[93m" COLOUR_WARNING="\033[93m"
COLOUR_ERROR="\033[91m" COLOUR_ERROR="\033[91m"
## General
def wrap(txt, length=80):
    """Wrap *txt* at *length* columns, never breaking inside a long word."""
    return '\n'.join(tw.wrap(txt, length, break_long_words=False))
def get_rss():
    """Download and parse the arXiv RSS feed (URL taken from the environment)."""
    return fp.parse(RSS_URL)
def get_entries(rss):
    """Return the list of entries carried by a parsed feed *rss*."""
    return rss["entries"]
def get_filters():
    """Load keyword filters from every non-hidden file in FILTERS_DIR.

    Filter-file format (one filter per file):
      - lines containing `#FIELD` declare an entry field, e.g. `#FIELD="title"`;
      - other lines starting with `#` and blank lines are ignored;
      - every remaining line is one keyword value.

    Returns:
        list[dict]: one {"fields": [...], "values": [...]} dict per file.
    """
    filters = []
    # Skip hidden files (".gitignore", editor droppings, ...).
    for filter_name in os.listdir(FILTERS_DIR):
        if filter_name.startswith("."):
            continue
        dic = {"fields": [], "values": []}
        with open(os.path.join(FILTERS_DIR, filter_name)) as filter_file:
            for line in filter_file:
                if "#FIELD" in line:
                    # `#FIELD="title"` -> "title"
                    field = line.split("=")[1].replace("\"", "").strip()
                    dic["fields"].append(field)
                elif line.startswith("#") or not line.strip():
                    # Comment or blank line. The original only whitelisted
                    # [" \n", "\n", ""], so a line of other whitespace was
                    # silently stored as a bogus keyword; strip() covers all.
                    continue
                else:
                    dic["values"].append(line.rstrip("\n"))
        filters.append(dic)
    return filters
## ArXiV Entries
def filter_entries(filters, entries):
    """Keep the entries matching at least one filter keyword.

    Filters are scanned in order; each (field, value) pair whose keyword
    occurs case-insensitively in the entry's field is recorded once, in
    scan order.

    Returns:
        tuple: three parallel lists — matching entries, the matched field
        names per entry, and the matched keywords per entry.
    """
    kept = []
    kept_fields = []
    kept_keywords = []
    for entry in entries:
        matched_fields = []
        matched_values = []
        for flt in filters:
            for field in flt["fields"]:
                for value in flt["values"]:
                    if value.upper() in str(entry[field]).upper():
                        if field not in matched_fields:
                            matched_fields.append(field)
                        if value not in matched_values:
                            matched_values.append(value)
        if matched_fields:
            kept.append(entry)
            kept_fields.append(matched_fields)
            kept_keywords.append(matched_values)
    return kept, kept_fields, kept_keywords
def print_entries(entries, fields=None, keywords=None): def print_entries(entries, fields=None, keywords=None):
for i in range(len(entries)): for i in range(len(entries)):
entry = entries[i] entry = entries[i]
@@ -89,67 +132,27 @@ def print_entries(entries, fields=None, keywords=None):
print("") print("")
return 0 return 0
def get_filters(): # ArXiV IDs
filters = []
filters_list = [f for f in os.listdir(FILTERS_DIR) if not f[0] == "."]
for i in range(len(filters_list)):
path = FILTERS_DIR + filters_list[i]
with open(path) as filter_file:
dic = {"fields": [], "values": []}
for line in filter_file.readlines():
if "#FIELD" in line:
field = line.split("=")[1].replace("\"", "").strip()
dic["fields"].append(field)
elif line[0] == "#" or line in [" \n", "\n", ""]:
continue
else:
value = line.replace("\n", "")
dic["values"].append(value)
filters.append(dic)
return filters
def filter_entries(filters, entries): def get_arxiv_ids(entries):
filtered_entries = []
filtered_fields = []
filtered_keywords = []
for entry in entries:
added = False
for filter_ in filters:
fields = filter_["fields"]
values = filter_["values"]
for field in fields:
for value in values:
if not added and value.upper() in str(entry[field]).upper():
filtered_entries.append(entry)
filtered_fields.append([field])
filtered_keywords.append([value])
added = True
elif added and value.upper() in str(entry[field]).upper():
if not field in filtered_fields[-1]:
filtered_fields[-1].append(field)
if not value in filtered_keywords[-1]:
filtered_keywords[-1].append(value)
return filtered_entries, filtered_fields, filtered_keywords
def get_ids(entries):
ids = [] ids = []
for entry in entries: for entry in entries:
ids.append(entry["id"]) ids.append(entry["id"])
return ids return ids
def save_ids(ids, library="saved"): def save_arxiv_ids(ids, library="saved"):
if isinstance(ids, list) or isinstance(ids, np.ndarray): if isinstance(ids, list) or isinstance(ids, np.ndarray):
ids = [i.replace("oai:", "").replace("arXiv.org:", "") for i in ids] ids = [i.replace("oai:", "").replace("arXiv.org:", "") for i in ids]
elif isinstance(ids, str): elif isinstance(ids, str):
ids = [ids.replace("oai:", "").replace("arXiv.org:", "")] ids = [ids.replace("oai:", "").replace("arXiv.org:", "")]
else: else:
raise Exception("The type of ids ({}) is not recognized".format(type(ids))) raise Exception(
"The type of ids ({}) is not recognized".format(type(ids))
)
with open(DB_DIR + library + ".txt", "a+") as db_file: with open(DB_DIR + library + ".txt", "a+") as db_file:
None # creates the file if not already in the directory None # creates the file if not already in the directory
with open(DB_DIR + library + ".txt", "r+") as db_file: with open(DB_DIR + library + ".txt", "r+") as db_file:
known_ids = [line.replace("\n", "") for line in db_file.readlines()] known_ids = [line.replace("\n", "") for line in db_file.readlines()]
print(known_ids)
with open(DB_DIR + library + ".txt", "a+") as db_file: with open(DB_DIR + library + ".txt", "a+") as db_file:
for i in ids: for i in ids:
if not i in known_ids: if not i in known_ids:
@@ -157,34 +160,39 @@ def save_ids(ids, library="saved"):
db_file.write("\n") db_file.write("\n")
return 0 return 0
## ArXiV
def get_arxiv_rss():
    """Download and parse the arXiv RSS feed at ARXIV_RSS_URL."""
    return fp.parse(ARXIV_RSS_URL)
def today_arxiv():
    """Fetch today's arXiv feed, filter it, save the new ids and print them.

    Returns:
        tuple: (filtered entries, matched fields, matched keywords) as
        produced by filter_entries().
    """
    feed = get_arxiv_rss()
    entries, fields, keywords = filter_entries(get_filters(), get_entries(feed))
    save_arxiv_ids(get_arxiv_ids(entries))
    print_entries(entries, fields, keywords)
    return entries, fields, keywords
def get_arxiv_from_ids(ids):
    """Query the arXiv API for the given identifier(s).

    *ids* may be a single id string or a list / numpy array of ids; any
    "oai:" / "arXiv.org:" prefixes are stripped before building the query.

    Returns:
        the feed parsed from the arXiv query URL.

    Raises:
        Exception: when *ids* is neither a string nor a list/array.
    """
    if isinstance(ids, str):
        ids = [ids]
    elif not isinstance(ids, (list, np.ndarray)):
        raise Exception(
            "The type of ids ({}) is not recognized".format(type(ids))
        )
    cleaned = [i.replace("oai:", "").replace("arXiv.org:", "") for i in ids]
    query = ARXIV_QUERY_URL + "id_list=" + ",".join(cleaned)
    return fp.parse(query)
## ADS-ABS
def get_adsabs_from_ids(ids):
    """Fetch ADS (adsabs) records for *ids*.

    NOTE(review): placeholder — not yet implemented; always returns None.
    """
    return None
""" entries, fields, keywords = today_arxiv()
ids = ["oai:arXiv.org:2509.13163"]
feed = get_api_ids(ids)
entries = get_entries(feed)
print_entries(entries)
"""
today_arxiv()