2025-10-09: Manual update

This commit is contained in:
Moussouni, Yaël
2025-10-09 16:14:16 +02:00
parent 2f5993471e
commit 6c17b76c37
4 changed files with 226 additions and 32 deletions

View File

@@ -31,6 +31,7 @@
## Directories
VENV_DIR="./venv/"
DB_DIR="./db/"
PDF_DIR="../Bibliography/"
FILTERS_DIR="./cfg/filters/"
## API Keys
@@ -41,4 +42,5 @@ ADSABS_API_KEY="$(cat cfg/adsabs.secret)"
ARXIV_QUERY_URL="https://export.arxiv.org/api/query?"
ARXIV_RSS_URL="https://rss.arxiv.org/atom/astro-ph"
ADSABS_QUERY_URL="https://api.adsabs.harvard.edu/v1/search/query?q="
ADSABS_QUERY_URL="https://api.adsabs.harvard.edu/v1/search/query?"
ADSABS_EXPORT_URL="https://api.adsabs.harvard.edu/v1/export/bibtexabs/"

View File

@@ -1,3 +1,10 @@
bibtexparser==2.0.0b8
certifi==2025.10.5
charset-normalizer==3.4.3
feedparser==6.0.12
idna==3.10
numpy==2.3.3
pylatexenc==3.0a33
requests==2.32.5
sgmllib3k==1.0.0
urllib3==2.5.0

View File

@@ -30,7 +30,8 @@ source activate.sh
echo "=== [ Directory setup ] ==="
mkdir -p -v cfg/filters
mkdir -p -v db
touch db/saved.txt
touch db/read.bib
touch db/unread.bib
chmod u+x *.sh
echo "=== Done ==="

View File

@@ -33,12 +33,19 @@ along with this program. If not, see www.gnu.org/licenses/.
import os
import textwrap as tw
import feedparser as fp
import bibtexparser as bib
import requests as rq
import numpy as np
from urllib.parse import urlencode, quote_plus
# Configuration pulled from the environment (populated by the project's
# activate/config shell scripts; each is None when the variable is unset).
FILTERS_DIR = os.environ.get("FILTERS_DIR")
DB_DIR = os.environ.get("DB_DIR")
PDF_DIR = os.environ.get("PDF_DIR")
ARXIV_QUERY_URL = os.environ.get("ARXIV_QUERY_URL")
ARXIV_RSS_URL = os.environ.get("ARXIV_RSS_URL")
ADSABS_QUERY_URL = os.environ.get("ADSABS_QUERY_URL")
ADSABS_EXPORT_URL = os.environ.get("ADSABS_EXPORT_URL")
ADSABS_API_KEY = os.environ.get("ADSABS_API_KEY")
# ANSI escape codes used to colour terminal output
COLOUR_DEFAULT="\033[0m"
COLOUR_INPUT="\033[36m"
@@ -53,9 +60,7 @@ def wrap(txt, length=80):
wrapped_txt = '\n'.join(tw.wrap(txt, length, break_long_words=False))
return wrapped_txt
def get_entries(rss):
    """Return the list of entries held by a parsed RSS feed."""
    return rss["entries"]
## Filters
def get_filters():
filters = []
@@ -63,11 +68,14 @@ def get_filters():
for i in range(len(filters_list)):
path = FILTERS_DIR + filters_list[i]
with open(path) as filter_file:
dic = {"fields": [], "values": []}
dic = {"fields": [], "values": [], "score": 1}
for line in filter_file.readlines():
if "#FIELD" in line:
field = line.split("=")[1].replace("\"", "").strip()
dic["fields"].append(field)
elif "#SCORE" in line:
field = line.split("=")[1].strip()
dic["score"] = int(field)
elif line[0] == "#" or line in [" \n", "\n", ""]:
continue
else:
@@ -76,36 +84,50 @@ def get_filters():
filters.append(dic)
return filters
## ArXiV Entries
def filter_entries(filters, entries):
    """Select the entries matching at least one filter and score them.

    Parameters
    ----------
    filters : list of dict
        Each filter has "fields" (entry keys to search), "values"
        (case-insensitive substrings) and "score" (points per match).
    entries : list of dict
        Normalised feed entries.

    Returns
    -------
    (filtered_entries, filtered_data)
        filtered_entries: the matching entries, in input order.
        filtered_data: dict with parallel lists "fields", "keywords" and
        "score" — per matched entry, the fields/keywords that hit and the
        accumulated score (first hit adds the entry; later hits on the
        same entry only raise its score).
    """
    filtered_entries = []
    filtered_fields = []
    filtered_keywords = []
    filtered_score = []
    for entry in entries:
        added = False
        for filter_ in filters:
            score = filter_["score"]
            for field in filter_["fields"]:
                # A missing field counts as empty text, not an error.
                val = entry.get(field, "")
                for value in filter_["values"]:
                    if value.upper() not in str(val).upper():
                        continue
                    if not added:
                        # First match: record the entry and start its score.
                        filtered_entries.append(entry)
                        filtered_fields.append([field])
                        filtered_keywords.append([value])
                        filtered_score.append(score)
                        added = True
                    else:
                        # Further matches only accumulate on the last entry.
                        filtered_score[-1] = filtered_score[-1] + score
                        if field not in filtered_fields[-1]:
                            filtered_fields[-1].append(field)
                        if value not in filtered_keywords[-1]:
                            filtered_keywords[-1].append(value)
    filtered_data = {"fields": filtered_fields,
                     "keywords": filtered_keywords,
                     "score": filtered_score}
    return filtered_entries, filtered_data
def print_entries(entries, fields=None, keywords=None):
## Print entries
def print_entries(entries, data=None):
for i in range(len(entries)):
entry = entries[i]
print(COLOUR_INFO, end="")
if "bibcode" in list(entry):
print(entry["bibcode"], end="")
if "id" in list(entry):
print(entry["id"], end="")
if "arxiv_announce_type" in list(entry) :
print(" (" + entry["arxiv_announce_type"] + ")", end="")
@@ -114,25 +136,28 @@ def print_entries(entries, fields=None, keywords=None):
print(COLOUR_DEFAULT + wrap(entry["title"]) + COLOUR_DEFAULT)
print(COLOUR_OUTPUT
+ wrap(", ".join([a["name"] for a in entry["authors"]]))
+ wrap(", ".join(entry["author"]))
+ COLOUR_DEFAULT)
print(COLOUR_INPUT
+ wrap("\n".join(entry["summary"].split("\n")[1:]))
+ wrap(entry["abstract"])
+ COLOUR_DEFAULT)
if fields is not None:
if data is not None:
print(COLOUR_ERROR
+ "Filtered field(s): "
+ ", ".join(fields[i])
+ ", ".join(data["fields"][i])
+ COLOUR_DEFAULT)
if keywords is not None:
print(COLOUR_ERROR
+ "Filtered keyword(s): "
+ ", ".join(keywords[i])
+ ", ".join(data["keywords"][i])
+ COLOUR_DEFAULT)
print(COLOUR_ERROR
+ "Filtered score: "
+ str(data["score"][i])
+ COLOUR_DEFAULT)
print("")
return 0
# ArXiV IDs
# IDs
def get_arxiv_ids(entries):
ids = []
@@ -169,12 +194,10 @@ def get_arxiv_rss():
def today_arxiv():
    """Fetch today's arXiv RSS feed, filter it, print and return the matches.

    Returns
    -------
    (entries, data)
        The filtered entries and the filter metadata produced by
        filter_entries (fields/keywords/score per entry).
    """
    filters = get_filters()
    feed = get_arxiv_rss()
    entries = get_arxiv_entries(feed)
    entries, data = filter_entries(filters, entries)
    print_entries(entries, data)
    return entries, data
def get_arxiv_from_ids(ids):
if isinstance(ids, list) or isinstance(ids, np.ndarray):
@@ -185,13 +208,174 @@ def get_arxiv_from_ids(ids):
raise Exception(
"The type of ids ({}) is not recognized".format(type(ids))
)
query = ARXIV_QUERY_URL + "id_list=" + ",".join(ids)
feed = fp.parse(query)
query = urlencode({"id_list": ",".join(ids)})
url = ARXIV_QUERY_URL + query
feed = fp.parse(url)
return feed
## ADS-ABS
def get_adsabs_from_ids(ids):
def ads_search(query, num=5, sort="date"):
    """Run *query* against the ADS search API and return the raw response.

    ``num`` limits the number of rows; ``sort`` selects the sort order.
    """
    params = {
        "q": query,
        "fl": ("bibcode,title,author,abstract,bibstem,doi,"
               "keyword,citation,pubdate"),
        "rows": num,
        "sort": sort,
    }
    url = ADSABS_QUERY_URL + urlencode(params)
    token = "Bearer " + ADSABS_API_KEY
    return rq.get(url, headers={'Authorization': token})
def ads_author(author, num=10, sort="date"):
    """Search ADS by author name, filter, print and return the matches."""
    active_filters = get_filters()
    response = ads_search("author:" + author, num=num, sort=sort)
    matched, match_data = filter_entries(active_filters,
                                         get_ads_entries(response))
    print_entries(matched, match_data)
    return matched, match_data
# Entries
def get_arxiv_entries(rss):
    """Normalise raw feedparser arXiv entries into plain dicts.

    Each output dict carries: id (OAI prefixes stripped), link, title,
    author (flat list of stripped names), abstract (summary with its
    first line and the leading "Abstract: " tag dropped) and pubdate
    (YYYY-MM-DD).
    """
    normalised = []
    for raw in rss["entries"]:
        names = []
        for person in raw["authors"]:
            # feedparser author items are dicts with a "name" key.
            if isinstance(person, dict):
                names += person["name"].split(",")
        normalised.append({
            "id": raw["id"].replace("oai:", "").replace("arXiv.org:", ""),
            "link": raw["link"],
            "title": raw["title"],
            "author": [name.strip() for name in names],
            # Drop the first summary line, then the "Abstract: " prefix.
            "abstract": "\n".join(raw["summary"].split("\n")[1:])[10:],
            "pubdate": raw["published"][0:10],
        })
    return normalised
def get_ads_entries(feed):
    """Normalise the docs of an ADS search response into entry dicts.

    Parameters
    ----------
    feed : requests.Response
        Response from ads_search, whose JSON body holds
        ``response.docs``.

    Returns
    -------
    list of dict
        The doc dicts, augmented in place with "link" (ADS abstract URL)
        and with "title"/"publisher" flattened to their first element.
    """
    # Parse the JSON body once: Response.json() re-parses on every call,
    # and the docs list is loop-invariant.
    docs = feed.json()["response"]["docs"]
    entries = []
    for entry in docs:
        entry["link"] = "https://ui.adsabs.harvard.edu/abs/" + entry["bibcode"]
        entry["title"] = entry["title"][0]
        entry["publisher"] = entry["bibstem"][0]
        entries.append(entry)
    return entries
# BibTeX
def arxiv_to_bibtex(entry,
                    arxtic_notes = "",
                    arxtic_category = "",
                    arxtic_keywords = "",
                    arxtic_score = 0,
                    arxtic_filename = ""):
    """Build a parsed BibTeX @misc record from a normalised arXiv entry.

    The ``arxtic_*`` keyword arguments are written as extra BibTeX
    fields carrying tool-specific metadata.
    """
    key = entry["id"]
    pairs = [
        ("title", entry["title"]),
        ("author", " and ".join(entry["author"])),
        ("year", entry["pubdate"][0:4]),
        ("eprint", key),
        ("url", entry["link"]),
        ("arxtic_notes", arxtic_notes),
        ("arxtic_category", arxtic_category),
        ("arxtic_keywords", arxtic_keywords),
        ("arxtic_score", str(arxtic_score)),
        ("arxtic_filename", str(arxtic_filename)),
    ]
    lines = [f"@misc{{{key},"]
    lines += [f"\t{name}={{{value}}}," for name, value in pairs]
    bibentry = "\n".join(lines) + "\n}"
    return bib.parse_string(bibentry)
def ads_to_bibtex(entry,
                  arxtic_notes = "",
                  arxtic_category = "",
                  arxtic_keywords = "",
                  arxtic_score = 0,
                  arxtic_filename = ""):
    """Fetch the ADS BibTeX export for *entry* and append arxtic_* fields."""
    export_url = ADSABS_EXPORT_URL + entry["bibcode"]
    auth = {'Authorization': "Bearer " + ADSABS_API_KEY}
    response = rq.get(export_url, headers=auth)
    # Drop the export's trailing two characters (closing brace + newline)
    # so the extra fields can be spliced in before re-closing the record.
    body = response.text[:-2]
    extra = [("arxtic_notes", arxtic_notes),
             ("arxtic_category", arxtic_category),
             ("arxtic_keywords", arxtic_keywords),
             ("arxtic_score", str(arxtic_score)),
             ("arxtic_filename", str(arxtic_filename))]
    body += ",\n" + "".join(f"\t{k}={{{v}}},\n" for k, v in extra) + "}"
    return bib.parse_string(body)
def list_pdf():
    """Build BibTeX records for every PDF found in PDF_DIR.

    File names are expected to follow one of two patterns:
    ``<x>_arxiv_<id parts...>.pdf`` (looked up on arXiv) or
    ``<first-author>_<year>_<bibstem>_<volume>_<page>.pdf``
    (looked up on ADS). Unrecognized or ambiguous names only
    print a warning.

    Returns
    -------
    list
        The parsed BibTeX records, one per identified PDF.
    """
    bibtex_list = []
    pdf_names = [f for f in os.listdir(PDF_DIR)
                 if not f[0] == "." and ".pdf" in f]
    for pdf_name in pdf_names:
        fields = pdf_name.replace(".pdf", "").split("_")
        if len(fields) < 2:
            # Fixed: was "COLOUR+DEFAULT", a NameError when this fired.
            print(COLOUR_WARNING
                  + f"Warning: {pdf_name} has not been correctly identified. "
                  + "(unrecognized format #1)"
                  + COLOUR_DEFAULT)
        elif fields[1].upper() == "ARXIV":
            # Remaining underscore-separated parts form the arXiv id.
            arxiv_id = "/".join(fields[2:])
            feed = get_arxiv_from_ids(arxiv_id)
            entries = get_arxiv_entries(feed)
            if len(entries) == 1:
                bibtex = arxiv_to_bibtex(entries[0],
                                         arxtic_score=99,
                                         arxtic_filename=pdf_name)
                bibtex_list.append(bibtex)
            else:
                print(COLOUR_WARNING
                      + f"Warning: {pdf_name} has not been correctly identified. "
                      + "(ambiguous #1)"
                      + COLOUR_DEFAULT)
        elif len(fields) == 5:
            first_author = fields[0]
            year = fields[1]
            bibstem = fields[2]
            volume = fields[3]
            page = fields[4]
            if bibstem == "AA": bibstem = "A&A"
            # NOTE(review): the field clauses are concatenated with no
            # separator (e.g. first_author:"X"year:(2020)...) — confirm
            # the ADS query parser accepts this, or add spaces.
            query=(f"first_author:\"{first_author}\""
                   f"year:({year})"
                   f"bibstem:\"{bibstem}\""
                   f"volume:\"{volume}\""
                   f"page:\"{page}\"")
            feed = ads_search(query, num=2)
            entries = get_ads_entries(feed)
            if len(entries) == 1:
                bibtex = ads_to_bibtex(entries[0],
                                       arxtic_score=99,
                                       arxtic_filename=pdf_name)
                bibtex_list.append(bibtex)
            else:
                print(COLOUR_WARNING
                      + f"Warning: {pdf_name} has not been correctly identified. "
                      + "(ambiguous #2)"
                      + COLOUR_DEFAULT)
        else:
            print(COLOUR_WARNING
                  + f"Warning: {pdf_name} has not been correctly identified. "
                  + "(unrecognized format #2)"
                  + COLOUR_DEFAULT)
    # Fixed: the collected records were built but discarded (return None);
    # callers that ignore the return value are unaffected.
    return bibtex_list
# Script entry point: index the local PDF library.
# (Removed the stale 3-tuple unpack of today_arxiv(), which now returns
# a 2-tuple and would raise ValueError at import.)
list_pdf()
#entries, data = today_arxiv()