2025-10-09: Manual update

This commit is contained in:
Moussouni, Yaël
2025-10-09 16:14:16 +02:00
parent 2f5993471e
commit 6c17b76c37
4 changed files with 226 additions and 32 deletions

View File

@@ -31,6 +31,7 @@
## Directories ## Directories
VENV_DIR="./venv/" VENV_DIR="./venv/"
DB_DIR="./db/" DB_DIR="./db/"
PDF_DIR="../Bibliography/"
FILTERS_DIR="./cfg/filters/" FILTERS_DIR="./cfg/filters/"
## API Keys ## API Keys
@@ -41,4 +42,5 @@ ADSABS_API_KEY="$(cat cfg/adsabs.secret)"
ARXIV_QUERY_URL="https://export.arxiv.org/api/query?" ARXIV_QUERY_URL="https://export.arxiv.org/api/query?"
ARXIV_RSS_URL="https://rss.arxiv.org/atom/astro-ph" ARXIV_RSS_URL="https://rss.arxiv.org/atom/astro-ph"
ADSABS_QUERY_URL="https://api.adsabs.harvard.edu/v1/search/query?q=" ADSABS_QUERY_URL="https://api.adsabs.harvard.edu/v1/search/query?"
ADSABS_EXPORT_URL="https://api.adsabs.harvard.edu/v1/export/bibtexabs/"

View File

@@ -1,3 +1,10 @@
bibtexparser==2.0.0b8
certifi==2025.10.5
charset-normalizer==3.4.3
feedparser==6.0.12 feedparser==6.0.12
idna==3.10
numpy==2.3.3 numpy==2.3.3
pylatexenc==3.0a33
requests==2.32.5
sgmllib3k==1.0.0 sgmllib3k==1.0.0
urllib3==2.5.0

View File

@@ -30,7 +30,8 @@ source activate.sh
echo "=== [ Directory setup ] ===" echo "=== [ Directory setup ] ==="
mkdir -p -v cfg/filters mkdir -p -v cfg/filters
mkdir -p -v db mkdir -p -v db
touch db/saved.txt touch db/read.bib
touch db/unread.bib
chmod u+x *.sh chmod u+x *.sh
echo "=== Done ===" echo "=== Done ==="

View File

@@ -33,12 +33,19 @@ along with this program. If not, see www.gnu.org/licenses/.
import os import os
import textwrap as tw import textwrap as tw
import feedparser as fp import feedparser as fp
import bibtexparser as bib
import requests as rq
import numpy as np
from urllib.parse import urlencode, quote_plus
FILTERS_DIR = os.environ.get("FILTERS_DIR") FILTERS_DIR = os.environ.get("FILTERS_DIR")
DB_DIR = os.environ.get("DB_DIR") DB_DIR = os.environ.get("DB_DIR")
PDF_DIR = os.environ.get("PDF_DIR")
ARXIV_QUERY_URL = os.environ.get("ARXIV_QUERY_URL") ARXIV_QUERY_URL = os.environ.get("ARXIV_QUERY_URL")
ARXIV_RSS_URL = os.environ.get("ARXIV_RSS_URL") ARXIV_RSS_URL = os.environ.get("ARXIV_RSS_URL")
ADSABS_QUERY_URL = os.environ.get("ADSABS_QUERY_URL") ADSABS_QUERY_URL = os.environ.get("ADSABS_QUERY_URL")
ADSABS_EXPORT_URL = os.environ.get("ADSABS_EXPORT_URL")
ADSABS_API_KEY = os.environ.get("ADSABS_API_KEY")
COLOUR_DEFAULT="\033[0m" COLOUR_DEFAULT="\033[0m"
COLOUR_INPUT="\033[36m" COLOUR_INPUT="\033[36m"
@@ -53,9 +60,7 @@ def wrap(txt, length=80):
wrapped_txt = '\n'.join(tw.wrap(txt, length, break_long_words=False)) wrapped_txt = '\n'.join(tw.wrap(txt, length, break_long_words=False))
return wrapped_txt return wrapped_txt
def get_entries(rss): ## Filters
entries = rss["entries"]
return entries
def get_filters(): def get_filters():
filters = [] filters = []
@@ -63,11 +68,14 @@ def get_filters():
for i in range(len(filters_list)): for i in range(len(filters_list)):
path = FILTERS_DIR + filters_list[i] path = FILTERS_DIR + filters_list[i]
with open(path) as filter_file: with open(path) as filter_file:
dic = {"fields": [], "values": []} dic = {"fields": [], "values": [], "score": 1}
for line in filter_file.readlines(): for line in filter_file.readlines():
if "#FIELD" in line: if "#FIELD" in line:
field = line.split("=")[1].replace("\"", "").strip() field = line.split("=")[1].replace("\"", "").strip()
dic["fields"].append(field) dic["fields"].append(field)
elif "#SCORE" in line:
field = line.split("=")[1].strip()
dic["score"] = int(field)
elif line[0] == "#" or line in [" \n", "\n", ""]: elif line[0] == "#" or line in [" \n", "\n", ""]:
continue continue
else: else:
@@ -76,37 +84,51 @@ def get_filters():
filters.append(dic) filters.append(dic)
return filters return filters
## ArXiV Entries
def filter_entries(filters, entries):
    """Select the entries matching at least one filter keyword.

    Parameters
    ----------
    filters: list of dicts with keys "fields" (entry fields to search),
        "values" (keywords to look for) and "score" (int weight), as
        returned by get_filters().
    entries: list of entry dicts (arXiv or ADS normalised entries).

    Returns
    -------
    (filtered_entries, filtered_data): the matching entries, and a dict
    of parallel lists "fields", "keywords" and "score" describing, for
    each kept entry, which fields/keywords matched and the accumulated
    score.
    """
    filtered_entries = []
    filtered_fields = []
    filtered_keywords = []
    filtered_score = []
    for entry in entries:
        added = False
        for filter_ in filters:
            score = filter_["score"]
            for field in filter_["fields"]:
                # A field missing from the entry (arXiv vs ADS sources
                # differ) is treated as empty instead of raising.
                val = str(entry[field]) if field in entry else ""
                for value in filter_["values"]:
                    if value.upper() not in val.upper():
                        continue
                    if not added:
                        # First match for this entry: keep it and start
                        # its match record.
                        filtered_entries.append(entry)
                        filtered_fields.append([field])
                        filtered_keywords.append([value])
                        filtered_score.append(score)
                        added = True
                    else:
                        # Further matches raise the score and record the
                        # extra field/keyword without duplicates.
                        filtered_score[-1] += score
                        if field not in filtered_fields[-1]:
                            filtered_fields[-1].append(field)
                        if value not in filtered_keywords[-1]:
                            filtered_keywords[-1].append(value)
    filtered_data = {"fields": filtered_fields,
                     "keywords": filtered_keywords,
                     "score": filtered_score}
    return filtered_entries, filtered_data
def print_entries(entries, fields=None, keywords=None): ## Print entries
def print_entries(entries, data=None):
for i in range(len(entries)): for i in range(len(entries)):
entry = entries[i] entry = entries[i]
print(COLOUR_INFO, end="") print(COLOUR_INFO, end="")
print(entry["id"], end="") if "bibcode" in list(entry):
print(entry["bibcode"], end="")
if "id" in list(entry):
print(entry["id"], end="")
if "arxiv_announce_type" in list(entry) : if "arxiv_announce_type" in list(entry) :
print(" (" + entry["arxiv_announce_type"] + ")", end="") print(" (" + entry["arxiv_announce_type"] + ")", end="")
print(" [" + entry["link"] + "]", end="") print(" [" + entry["link"] + "]", end="")
@@ -114,25 +136,28 @@ def print_entries(entries, fields=None, keywords=None):
print(COLOUR_DEFAULT + wrap(entry["title"]) + COLOUR_DEFAULT) print(COLOUR_DEFAULT + wrap(entry["title"]) + COLOUR_DEFAULT)
print(COLOUR_OUTPUT print(COLOUR_OUTPUT
+ wrap(", ".join([a["name"] for a in entry["authors"]])) + wrap(", ".join(entry["author"]))
+ COLOUR_DEFAULT) + COLOUR_DEFAULT)
print(COLOUR_INPUT print(COLOUR_INPUT
+ wrap("\n".join(entry["summary"].split("\n")[1:])) + wrap(entry["abstract"])
+ COLOUR_DEFAULT) + COLOUR_DEFAULT)
if fields is not None: if data is not None:
print(COLOUR_ERROR print(COLOUR_ERROR
+ "Filtered field(s): " + "Filtered field(s): "
+ ", ".join(fields[i]) + ", ".join(data["fields"][i])
+ COLOUR_DEFAULT) + COLOUR_DEFAULT)
if keywords is not None:
print(COLOUR_ERROR print(COLOUR_ERROR
+ "Filtered keyword(s): " + "Filtered keyword(s): "
+ ", ".join(keywords[i]) + ", ".join(data["keywords"][i])
+ COLOUR_DEFAULT)
print(COLOUR_ERROR
+ "Filtered score: "
+ str(data["score"][i])
+ COLOUR_DEFAULT) + COLOUR_DEFAULT)
print("") print("")
return 0 return 0
# ArXiV IDs # IDs
def get_arxiv_ids(entries): def get_arxiv_ids(entries):
ids = [] ids = []
@@ -169,12 +194,10 @@ def get_arxiv_rss():
def today_arxiv():
    """Fetch today's arXiv RSS feed, filter it and print the matches.

    Returns
    -------
    (entries, data): the filtered entries and their filter data
    (matched fields, keywords and scores), as produced by
    filter_entries().
    """
    filters = get_filters()
    feed = get_arxiv_rss()
    entries = get_arxiv_entries(feed)
    entries, data = filter_entries(filters, entries)
    print_entries(entries, data)
    return entries, data
def get_arxiv_from_ids(ids): def get_arxiv_from_ids(ids):
if isinstance(ids, list) or isinstance(ids, np.ndarray): if isinstance(ids, list) or isinstance(ids, np.ndarray):
@@ -185,13 +208,174 @@ def get_arxiv_from_ids(ids):
raise Exception( raise Exception(
"The type of ids ({}) is not recognized".format(type(ids)) "The type of ids ({}) is not recognized".format(type(ids))
) )
query = ARXIV_QUERY_URL + "id_list=" + ",".join(ids) query = urlencode({"id_list": ",".join(ids)})
feed = fp.parse(query) url = ARXIV_QUERY_URL + query
feed = fp.parse(url)
return feed return feed
## ADS-ABS ## ADS-ABS
def ads_search(query, num=5, sort="date"):
    """Query the ADS search API.

    Parameters
    ----------
    query: ADS query string (e.g. 'author:"Doe, J."').
    num: maximum number of rows to request.
    sort: sort order understood by ADS (default "date").

    Returns
    -------
    The raw requests.Response from the ADS /search/query endpoint.
    """
    # Keep the encoded form in a separate name instead of shadowing the
    # `query` argument.
    params = urlencode({"q": query,
                        "fl": ("bibcode,title,author,abstract,bibstem,doi,"
                               "keyword,citation,pubdate"),
                        "rows": num,
                        "sort": sort})
    url = ADSABS_QUERY_URL + params
    header = "Bearer " + ADSABS_API_KEY
    feed = rq.get(url, headers={'Authorization': header})
    return feed
def ads_author(author, num=10, sort="date"):
    """Search ADS for an author's publications, filter and print them.

    Parameters
    ----------
    author: author name passed to the ADS "author:" query field.
    num: maximum number of results to fetch.
    sort: ADS sort order (default "date").

    Returns
    -------
    (entries, data): the filtered entries and their filter data.
    """
    filters = get_filters()
    feed = ads_search("author:" + author, num=num, sort=sort)
    entries = get_ads_entries(feed)
    entries, data = filter_entries(filters, entries)
    print_entries(entries, data)
    return entries, data
# Entries
def get_arxiv_entries(rss):
    """Normalise a parsed arXiv RSS/Atom feed into plain entry dicts.

    Parameters
    ----------
    rss: feedparser result (dict-like) exposing an "entries" list.

    Returns
    -------
    A list of dicts with keys "id", "link", "title", "author" (list of
    stripped name fragments), "abstract" and "pubdate" (YYYY-MM-DD).
    """
    entries = []
    for entry_old in rss["entries"]:
        entry = {}
        # Feed ids look like "oai:arXiv.org:2501.00001"; keep the bare id.
        entry["id"] = (entry_old["id"]
                       .replace("oai:", "")
                       .replace("arXiv.org:", ""))
        entry["link"] = entry_old["link"]
        entry["title"] = entry_old["title"]
        names = []
        for element in entry_old["authors"]:
            # feedparser can yield non-dict items here; skip them.
            if isinstance(element, dict):
                names += element["name"].split(",")
        entry["author"] = [a.strip() for a in names]
        # Drop the first summary line ("arXiv:... Announce Type: ...")
        # and the leading "Abstract: " prefix (10 characters).
        entry["abstract"] = "\n".join(entry_old["summary"].split("\n")[1:])[10:]
        # Keep only the YYYY-MM-DD part of the publication timestamp.
        entry["pubdate"] = entry_old["published"][0:10]
        entries.append(entry)
    return entries
def get_ads_entries(feed):
    """Normalise an ADS API response into plain entry dicts.

    Parameters
    ----------
    feed: requests.Response from the ADS search API (JSON body with a
        "response"/"docs" structure).

    Returns
    -------
    The list of docs, each augmented with a "link" to the ADS abstract
    page and with "title"/"bibstem" flattened to scalar "title" and
    "publisher" strings.
    """
    # Parse the JSON payload once, not once per doc (the original
    # re-called feed.json() on every loop iteration).
    docs = feed.json()["response"]["docs"]
    entries = []
    for entry in docs:
        entry["link"] = "https://ui.adsabs.harvard.edu/abs/" + entry["bibcode"]
        entry["title"] = entry["title"][0]
        entry["publisher"] = entry["bibstem"][0]
        entries.append(entry)
    return entries
# BibTeX
def arxiv_to_bibtex(entry,
                    arxtic_notes="",
                    arxtic_category="",
                    arxtic_keywords="",
                    arxtic_score=0,
                    arxtic_filename=""):
    """Build a BibTeX @misc record from a normalised arXiv entry.

    Parameters
    ----------
    entry: dict as produced by get_arxiv_entries() (keys "id", "title",
        "author", "pubdate", "link").
    arxtic_*: custom management fields embedded in the record.

    Returns
    -------
    A bibtexparser library parsed from the generated record.
    """
    key = entry["id"]
    title = entry["title"]
    author = " and ".join(entry["author"])
    year = entry["pubdate"][0:4]
    # The arXiv id doubles as both citation key and eprint field.
    eprint = key
    url = entry["link"]
    bibentry = (f"@misc{{{key},\n"
                f"\ttitle={{{title}}},\n"
                f"\tauthor={{{author}}},\n"
                f"\tyear={{{year}}},\n"
                f"\teprint={{{eprint}}},\n"
                f"\turl={{{url}}},\n"
                f"\tarxtic_notes={{{arxtic_notes}}},\n"
                f"\tarxtic_category={{{arxtic_category}}},\n"
                f"\tarxtic_keywords={{{arxtic_keywords}}},\n"
                f"\tarxtic_score={{{str(arxtic_score)}}},\n"
                f"\tarxtic_filename={{{str(arxtic_filename)}}},\n"
                "}")
    bibtex = bib.parse_string(bibentry)
    return bibtex
def ads_to_bibtex(entry,
                  arxtic_notes="",
                  arxtic_category="",
                  arxtic_keywords="",
                  arxtic_score=0,
                  arxtic_filename=""):
    """Export an ADS entry to BibTeX and append the arxtic_* fields.

    Parameters
    ----------
    entry: dict with at least a "bibcode" key (from get_ads_entries()).
    arxtic_*: custom management fields appended to the exported record.

    Returns
    -------
    A bibtexparser library parsed from the augmented record.
    """
    bibcode = entry["bibcode"]
    url = ADSABS_EXPORT_URL + bibcode
    header = "Bearer " + ADSABS_API_KEY
    feed = rq.get(url, headers={'Authorization': header})
    # Strip the trailing two characters (assumed closing "}\n" of the
    # exported record -- TODO confirm against the export API output) so
    # the extra fields can be spliced in before re-closing the entry.
    bibentry = feed.text
    bibentry = bibentry[:-2]
    bibentry += (",\n"
                 f"\tarxtic_notes={{{arxtic_notes}}},\n"
                 f"\tarxtic_category={{{arxtic_category}}},\n"
                 f"\tarxtic_keywords={{{arxtic_keywords}}},\n"
                 f"\tarxtic_score={{{str(arxtic_score)}}},\n"
                 f"\tarxtic_filename={{{str(arxtic_filename)}}},\n"
                 "}")
    bibtex = bib.parse_string(bibentry)
    return bibtex
def list_pdf():
    """Scan PDF_DIR and build BibTeX records for recognised filenames.

    Two naming schemes are recognised (underscore-separated, ".pdf"
    extension):
      * "<anything>_arxiv_<id parts...>.pdf" -- resolved via the arXiv
        API (id parts are re-joined with "/");
      * "<author>_<year>_<bibstem>_<volume>_<page>.pdf" -- resolved via
        an ADS search.
    Unrecognised or ambiguous names only print a warning.

    Returns None.
    """
    # NOTE(review): bibtex_list is collected but not yet persisted or
    # returned -- presumably wired up in a later commit.
    bibtex_list = []
    pdf_names = [f for f in os.listdir(PDF_DIR)
                 if not f[0] == "." and ".pdf" in f]
    for pdf_name in pdf_names:
        fields = pdf_name.replace(".pdf", "").split("_")
        if len(fields) < 2:
            # Fixed: was "COLOUR+DEFAULT", a NameError whenever this
            # warning branch was reached.
            print(COLOUR_WARNING
                  + f"Warning: {pdf_name} has not been correctly identified. "
                  + "(unrecognized format #1)"
                  + COLOUR_DEFAULT)
        elif fields[1].upper() == "ARXIV":
            # Re-join with "/" -- presumably old-style arXiv ids such as
            # "astro-ph/0601001" are stored with "_" separators; confirm.
            arxiv_id = "/".join(fields[2:])
            feed = get_arxiv_from_ids(arxiv_id)
            entries = get_arxiv_entries(feed)
            if len(entries) == 1:
                entry = entries[0]
                bibtex = arxiv_to_bibtex(entry,
                                         arxtic_score=99,
                                         arxtic_filename=pdf_name)
                bibtex_list.append(bibtex)
            else:
                print(COLOUR_WARNING
                      + f"Warning: {pdf_name} has not been correctly identified. "
                      + "(ambiguous #1)"
                      + COLOUR_DEFAULT)
        elif len(fields) == 5:
            first_author = fields[0]
            year = fields[1]
            bibstem = fields[2]
            volume = fields[3]
            page = fields[4]
            # "&" is not filename-safe, so "AA" stands in for "A&A".
            if bibstem == "AA": bibstem = "A&A"
            query = (f"first_author:\"{first_author}\""
                     f"year:({year})"
                     f"bibstem:\"{bibstem}\""
                     f"volume:\"{volume}\""
                     f"page:\"{page}\"")
            # num=2 so that a second hit reveals an ambiguous match.
            feed = ads_search(query, num=2)
            entries = get_ads_entries(feed)
            if len(entries) == 1:
                entry = entries[0]
                bibtex = ads_to_bibtex(entry,
                                       arxtic_score=99,
                                       arxtic_filename=pdf_name)
                bibtex_list.append(bibtex)
            else:
                print(COLOUR_WARNING
                      + f"Warning: {pdf_name} has not been correctly identified. "
                      + "(ambiguous #2)"
                      + COLOUR_DEFAULT)
        else:
            print(COLOUR_WARNING
                  + f"Warning: {pdf_name} has not been correctly identified. "
                  + "(unrecognized format #2)"
                  + COLOUR_DEFAULT)
    return None
# Script body: index the local PDF library on start-up.
list_pdf()
#entries, data = today_arxiv()