diff --git a/cfg/general_config.cfg b/cfg/general_config.cfg index d812b33..b134ca8 100644 --- a/cfg/general_config.cfg +++ b/cfg/general_config.cfg @@ -31,6 +31,7 @@ ## Directories VENV_DIR="./venv/" DB_DIR="./db/" +PDF_DIR="../Bibliography/" FILTERS_DIR="./cfg/filters/" ## API Keys @@ -41,4 +42,5 @@ ADSABS_API_KEY="$(cat cfg/adsabs.secret)" ARXIV_QUERY_URL="https://export.arxiv.org/api/query?" ARXIV_RSS_URL="https://rss.arxiv.org/atom/astro-ph" -ADSABS_QUERY_URL="https://api.adsabs.harvard.edu/v1/search/query?q=" +ADSABS_QUERY_URL="https://api.adsabs.harvard.edu/v1/search/query?" +ADSABS_EXPORT_URL="https://api.adsabs.harvard.edu/v1/export/bibtexabs/" diff --git a/cfg/requirements.txt b/cfg/requirements.txt index 008018a..4f7d591 100644 --- a/cfg/requirements.txt +++ b/cfg/requirements.txt @@ -1,3 +1,10 @@ +bibtexparser==2.0.0b8 +certifi==2025.10.5 +charset-normalizer==3.4.3 feedparser==6.0.12 +idna==3.10 numpy==2.3.3 +pylatexenc==3.0a33 +requests==2.32.5 sgmllib3k==1.0.0 +urllib3==2.5.0 diff --git a/setup.sh b/setup.sh index a5b1f71..2f48983 100755 --- a/setup.sh +++ b/setup.sh @@ -30,7 +30,8 @@ source activate.sh echo "=== [ Directory setup ] ===" mkdir -p -v cfg/filters mkdir -p -v db -touch db/saved.txt +touch db/read.bib +touch db/unread.bib chmod u+x *.sh echo "=== Done ===" diff --git a/src/arxtic.py b/src/arxtic.py index 04ad236..aaae729 100644 --- a/src/arxtic.py +++ b/src/arxtic.py @@ -33,12 +33,19 @@ along with this program. If not, see www.gnu.org/licenses/. 
import os import textwrap as tw import feedparser as fp +import bibtexparser as bib +import requests as rq +import numpy as np +from urllib.parse import urlencode, quote_plus FILTERS_DIR = os.environ.get("FILTERS_DIR") DB_DIR = os.environ.get("DB_DIR") +PDF_DIR = os.environ.get("PDF_DIR") ARXIV_QUERY_URL = os.environ.get("ARXIV_QUERY_URL") ARXIV_RSS_URL = os.environ.get("ARXIV_RSS_URL") ADSABS_QUERY_URL = os.environ.get("ADSABS_QUERY_URL") +ADSABS_EXPORT_URL = os.environ.get("ADSABS_EXPORT_URL") +ADSABS_API_KEY = os.environ.get("ADSABS_API_KEY") COLOUR_DEFAULT="\033[0m" COLOUR_INPUT="\033[36m" @@ -53,9 +60,7 @@ def wrap(txt, length=80): wrapped_txt = '\n'.join(tw.wrap(txt, length, break_long_words=False)) return wrapped_txt -def get_entries(rss): - entries = rss["entries"] - return entries +## Filters def get_filters(): filters = [] @@ -63,11 +68,14 @@ def get_filters(): for i in range(len(filters_list)): path = FILTERS_DIR + filters_list[i] with open(path) as filter_file: - dic = {"fields": [], "values": []} + dic = {"fields": [], "values": [], "score": 1} for line in filter_file.readlines(): if "#FIELD" in line: field = line.split("=")[1].replace("\"", "").strip() dic["fields"].append(field) + elif "#SCORE" in line: + field = line.split("=")[1].strip() + dic["score"] = int(field) elif line[0] == "#" or line in [" \n", "\n", ""]: continue else: @@ -76,37 +84,51 @@ def get_filters(): filters.append(dic) return filters -## ArXiV Entries - def filter_entries(filters, entries): filtered_entries = [] filtered_fields = [] filtered_keywords = [] + filtered_score = [] for entry in entries: added = False for filter_ in filters: fields = filter_["fields"] values = filter_["values"] + score = filter_["score"] for field in fields: for value in values: - if not added and value.upper() in str(entry[field]).upper(): + if field in list(entry): + val = entry[field] + else: + val = "" + if not added and value.upper() in str(val).upper(): filtered_entries.append(entry) 
filtered_fields.append([field]) filtered_keywords.append([value]) + filtered_score.append(score) added = True - elif added and value.upper() in str(entry[field]).upper(): + elif added and value.upper() in str(val).upper(): + filtered_score[-1] = filtered_score[-1] + score if not field in filtered_fields[-1]: filtered_fields[-1].append(field) if not value in filtered_keywords[-1]: filtered_keywords[-1].append(value) - return filtered_entries, filtered_fields, filtered_keywords + filtered_data = {"fields": filtered_fields, + "keywords": filtered_keywords, + "score": filtered_score} + return filtered_entries, filtered_data -def print_entries(entries, fields=None, keywords=None): +## Print entries + +def print_entries(entries, data=None): for i in range(len(entries)): entry = entries[i] - + print(COLOUR_INFO, end="") - print(entry["id"], end="") + if "bibcode" in list(entry): + print(entry["bibcode"], end="") + if "id" in list(entry): + print(entry["id"], end="") if "arxiv_announce_type" in list(entry) : print(" (" + entry["arxiv_announce_type"] + ")", end="") print(" [" + entry["link"] + "]", end="") @@ -114,25 +136,28 @@ def print_entries(entries, fields=None, keywords=None): print(COLOUR_DEFAULT + wrap(entry["title"]) + COLOUR_DEFAULT) print(COLOUR_OUTPUT - + wrap(", ".join([a["name"] for a in entry["authors"]])) + + wrap(", ".join(entry["author"])) + COLOUR_DEFAULT) print(COLOUR_INPUT - + wrap("\n".join(entry["summary"].split("\n")[1:])) + + wrap(entry["abstract"]) + COLOUR_DEFAULT) - if fields is not None: + if data is not None: print(COLOUR_ERROR + "Filtered field(s): " - + ", ".join(fields[i]) + + ", ".join(data["fields"][i]) + COLOUR_DEFAULT) - if keywords is not None: print(COLOUR_ERROR + "Filtered keyword(s): " - + ", ".join(keywords[i]) + + ", ".join(data["keywords"][i]) + + COLOUR_DEFAULT) + print(COLOUR_ERROR + + "Filtered score: " + + str(data["score"][i]) + COLOUR_DEFAULT) print("") return 0 -# ArXiV IDs +# IDs def get_arxiv_ids(entries): ids = [] @@ 
-169,12 +194,10 @@ def get_arxiv_rss(): def today_arxiv(): filters = get_filters() feed = get_arxiv_rss() - entries = get_entries(feed) - entries, fields, keywords = filter_entries(filters, entries) - ids = get_arxiv_ids(entries) - save_arxiv_ids(ids) - print_entries(entries, fields, keywords) - return entries, fields, keywords + entries = get_arxiv_entries(feed) + entries, data = filter_entries(filters, entries) + print_entries(entries, data) + return entries, data def get_arxiv_from_ids(ids): if isinstance(ids, list) or isinstance(ids, np.ndarray): @@ -185,13 +208,174 @@ def get_arxiv_from_ids(ids): raise Exception( "The type of ids ({}) is not recognized".format(type(ids)) ) - query = ARXIV_QUERY_URL + "id_list=" + ",".join(ids) - feed = fp.parse(query) + query = urlencode({"id_list": ",".join(ids)}) + url = ARXIV_QUERY_URL + query + feed = fp.parse(url) return feed ## ADS-ABS -def get_adsabs_from_ids(ids): +def ads_search(query, num=5, sort="date"): + query = urlencode({"q": query, + "fl": ("bibcode,title,author,abstract,bibstem,doi," + "keyword,citation,pubdate"), + "rows": num, + "sort": sort}) + url = ADSABS_QUERY_URL + query + header = "Bearer " + ADSABS_API_KEY + feed = rq.get(url, headers={'Authorization': header}) + return feed + +def ads_author(author, num=10, sort="date"): + filters = get_filters() + feed = ads_search("author:" + author, num=num, sort=sort) + entries = get_ads_entries(feed) + entries, data = filter_entries(filters, entries) + print_entries(entries, data) + return entries, data + +# Entries + +def get_arxiv_entries(rss): + entries_old = rss["entries"] + entries = [] + for entry_old in entries_old: + entry = {} + entry["id"] = entry_old["id"].replace("oai:", "").replace("arXiv.org:", "") + entry["link"] = entry_old["link"] + entry["title"] = entry_old["title"] + tmp = [] + for element in entry_old["authors"]: + if isinstance(element, dict): + tmp += element["name"].split(",") + entry["author"] = [a.strip() for a in tmp] + 
entry["abstract"] = "\n".join(entry_old["summary"].split("\n")[1:])[10:] + entry["pubdate"] = entry_old["published"][0:10] + entries.append(entry) + return entries + + +def get_ads_entries(feed): + num = len(feed.json()["response"]["docs"]) + entries = [] + for i in range(num): + entry = feed.json()["response"]["docs"][i] + entry["link"] = "https://ui.adsabs.harvard.edu/abs/" + entry["bibcode"] + entry["title"] = entry["title"][0] + entry["publisher"] = entry["bibstem"][0] + entries.append(entry) + return entries + +# BibTeX + +def arxiv_to_bibtex(entry, + arxtic_notes = "", + arxtic_category = "", + arxtic_keywords = "", + arxtic_score = 0, + arxtic_filename = ""): + key = entry["id"] + title = entry["title"] + author = " and ".join(entry["author"]) + year = entry["pubdate"][0:4] + eprint = key + url = entry["link"] + bibentry = (f"@misc{{{key},\n" + f"\ttitle={{{title}}},\n" + f"\tauthor={{{author}}},\n" + f"\tyear={{{year}}},\n" + f"\teprint={{{eprint}}},\n" + f"\turl={{{url}}},\n" + f"\tarxtic_notes={{{arxtic_notes}}},\n" + f"\tarxtic_category={{{arxtic_category}}},\n" + f"\tarxtic_keywords={{{arxtic_keywords}}},\n" + f"\tarxtic_score={{{str(arxtic_score)}}},\n" + f"\tarxtic_filename={{{str(arxtic_filename)}}},\n" + "}") + bibtex = bib.parse_string(bibentry) + return bibtex + +def ads_to_bibtex(entry, + arxtic_notes = "", + arxtic_category = "", + arxtic_keywords = "", + arxtic_score = 0, + arxtic_filename = ""): + bibcode = entry["bibcode"] + url = ADSABS_EXPORT_URL + bibcode + header = "Bearer " + ADSABS_API_KEY + feed = rq.get(url, headers={'Authorization': header}) + bibentry = feed.text + bibentry = bibentry[:-2] + bibentry += (",\n" + f"\tarxtic_notes={{{arxtic_notes}}},\n" + f"\tarxtic_category={{{arxtic_category}}},\n" + f"\tarxtic_keywords={{{arxtic_keywords}}},\n" + f"\tarxtic_score={{{str(arxtic_score)}}},\n" + f"\tarxtic_filename={{{str(arxtic_filename)}}},\n" + "}") + bibtex = bib.parse_string(bibentry) + return bibtex + +def list_pdf(): + 
 bibtex_list = [] + pdf_names = [f for f in os.listdir(PDF_DIR) + if not f[0] == "." and ".pdf" in f] + for pdf_name in pdf_names: + fields = pdf_name.replace(".pdf", "").split("_") + if len(fields) < 2: + print(COLOUR_WARNING + + f"Warning: {pdf_name} has not been correctly identified. " + + "(unrecognized format #1)" + + COLOUR_DEFAULT) + elif fields[1].upper() == "ARXIV": + arxiv_id = "/".join(fields[2:]) + feed = get_arxiv_from_ids(arxiv_id) + entries = get_arxiv_entries(feed) + if len(entries) == 1: + entry = entries[0] + bibtex = arxiv_to_bibtex(entry, + arxtic_score=99, + arxtic_filename=pdf_name) + bibtex_list.append(bibtex) + else: + print(COLOUR_WARNING + + f"Warning: {pdf_name} has not been correctly identified. " + + "(ambiguous #1)" + + COLOUR_DEFAULT) + elif len(fields) == 5: + first_author = fields[0] + year = fields[1] + bibstem = fields[2] + volume = fields[3] + page = fields[4] + if bibstem == "AA": bibstem = "A&A" + query=(f"first_author:\"{first_author}\"" + f" year:({year})" + f" bibstem:\"{bibstem}\"" + f" volume:\"{volume}\"" + f" page:\"{page}\"") + feed = ads_search(query, num=2) + entries = get_ads_entries(feed) + if len(entries) == 1: + entry = entries[0] + bibtex = ads_to_bibtex(entry, + arxtic_score=99, + arxtic_filename=pdf_name) + bibtex_list.append(bibtex) + else: + print(COLOUR_WARNING + + f"Warning: {pdf_name} has not been correctly identified. " + + "(ambiguous #2)" + + COLOUR_DEFAULT) + else: + print(COLOUR_WARNING + + f"Warning: {pdf_name} has not been correctly identified. " + + "(unrecognized format #2)" + + COLOUR_DEFAULT) return None - -entries, fields, keywords = today_arxiv() + + +list_pdf() + +#entries, data = today_arxiv()