lemmy PESOS
If you like linkblogging, a cool Indieweb concept is PESOS: take your content that you post everywhere and pull it onto your own site. For me, these days, I do most of that with Lemmy. If you haven’t heard of Lemmy, it’s an “ActivityPub” “link aggregator”–so as Twitter is to Mastodon, Reddit is to Lemmy.[1]
The Indieweb folks properly prefer POSSE, where you take the content from your own site and push it to things like Twitter… but there’s a reason that, outside of the fantastic functionality provided by brid.gy, I don’t do this: I hate auth flows! If I’m trying to get my 100% public content mirrored, I don’t want to have to auth for write access to Facebook or whatever. Therefore, I tend to set up simpler stuff that can scrape or fetch my public activity from things like Last.fm or Ravelry and add it to my site–the only auth that matters then is the auth on the backing git repository. Much easier!
the Python script I use for Lemmy > Jekyll
I have this run every 40 minutes or so. My cron invocation wraps this in a shell script to add things in git, commit, and push, because that’s my site workflow–and I don’t build with this cron job–but that’s all up to you. There’s some deprecated time handling I’ve left in there that you can fix up and use if you don’t want to take a dependency on dateutil; as written, it does not produce the correct times, because timezones are terrible.
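For reference, the wrapper can be as dumb as the sketch below–a minimal example, where the script names and paths (lemmy_pesos.py, lemmy_pesos.sh, ~/mayaland) are hypothetical stand-ins, not my literal setup:

#!/bin/sh
# hypothetical wrapper, e.g. ~/bin/lemmy_pesos.sh
cd "$HOME/mayaland" || exit 1
# run the Python script below, saved wherever you like
/usr/bin/python3 "$HOME/bin/lemmy_pesos.py"
# commit any new posts; if nothing changed, the commit fails and we skip the push
git add _posts
git commit -m "lemmy PESOS $(date -u +%Y-%m-%dT%H:%MZ)" && git push

The matching crontab line fires at :00 and :40 past each hour, which is close enough to “every 40 minutes”:

*/40 * * * * $HOME/bin/lemmy_pesos.sh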
#!/usr/bin/python3
# pip3 install feedparser
import feedparser
import re
import time
from datetime import datetime, timezone
import os
# pip3 install python-dateutil
from dateutil import parser
from dateutil.tz import gettz
lemmy_instance = "dev.lemmy.ml"
user = "kixiQu"
exclude_this_site = "maya.land"
# I'm pretty sure they won't be changing this but you never know
url = f"https://{lemmy_instance}/feeds/u/{user}.xml?sort=New"
# This is a directory where copies of the posts live in order to keep
# track of which we've already processed (even if we later delete the
# copy in the Jekyll repo)
filedir = os.path.expanduser("~/rss")
# This is the end location of the post whence it is built into the site.
targetdir = os.path.expanduser("~/mayaland/_posts/")
if not (os.path.exists(filedir) and os.path.isdir(filedir)):
    os.mkdir(filedir)  # famous race condition, doesn't matter for us
# https://www.peterbe.com/plog/fastest-python-function-to-slugify-a-string
non_url_safe = ['"', '#', '$', '%', '&', '+',
                ',', '/', ':', ';', '=', '?', '(', ')',
                '@', '[', '\\', ']', '^', '`',
                '{', '|', '}', '~', "'"]
non_url_safe_regex = re.compile(
    r'[{}]'.format(''.join(re.escape(x) for x in non_url_safe)))
def kebabify(text):
    text = non_url_safe_regex.sub('', text).strip()
    # let's do 8 words of the title
    text = u'-'.join(re.split(r'\s+', text)[0:8])
    return text

def get_filename_from_post(post):
    date = get_date_from_time_string(post['published'])
    keb = kebabify(post['title']).lower()
    return f"{date}-{keb}.md"
def get_content_from_summary(summary):
    # Strip the boilerplate Lemmy prepends to the summary, keeping only
    # what follows the trailing "comments" link.
    # This is incredibly fragile!!!
    # The relevant line is here:
    # https://github.com/LemmyNet/lemmy/blob/eee394cf8510a42c0b772c420b0785965df4d0af/src/routes/feeds.rs#L364
    # (str.index will raise ValueError if that markup ever changes)
    last_chunk = r"comments</a>"
    i = summary.index(last_chunk) + len(last_chunk)
    if i < len(summary):
        return summary[i:]
    else:
        return ""
# deprecated AF, don't use these, oh my tz struggles
def get_local_datetime_from_struct_time(t):
    return datetime.fromtimestamp(
        time.mktime(t)
    ).replace(
        tzinfo=timezone.utc
    ).astimezone(tz=None)

def get_date_string_from_time(t):
    dt = get_local_datetime_from_struct_time(t)
    return dt.date().isoformat()

def get_full_date_from_time(t):
    dt = get_local_datetime_from_struct_time(t)
    return dt.isoformat()
# instead use these
def get_local_datetime_from_time_string(t):
    return parser.parse(t).astimezone(tz=gettz("America/Los_Angeles"))

def get_date_from_time_string(t):
    dt = get_local_datetime_from_time_string(t)
    return dt.date().isoformat()

def get_full_date_from_time_string(t):
    dt = get_local_datetime_from_time_string(t)
    return dt.isoformat()
def get_community(tags):
    # The community name is the first word of the first tag's term.
    if not tags:
        return ""
    d = tags[0]
    if 'term' not in d:
        return ""
    t = d['term']
    if ' ' not in t:
        return t
    else:
        return t.split(' ')[0]
template = """---
title: "{title}"
layout: lemmy
community: {community}
category: responses
tags: from_lemmy bookmark
replyto: "{link}"
date: "{isodate}"
lemmylink: "{lemmy_link}"
---
{content}
"""
def save_post(path, file_content):
    with open(path, 'w') as f:
        f.write(file_content)

def get_file_string(post):
    title = post['title']
    link = post['link']
    content = get_content_from_summary(post['summary'])
    # .get(): entries without tags shouldn't crash us
    community = get_community(post.get('tags', []))
    # Call it ISO....ish
    isodate = get_full_date_from_time_string(post['published'])
    lemmy_link = post['id']
    return template.format(title=title, link=link, content=content,
                           community=community, isodate=isodate,
                           lemmy_link=lemmy_link)
feed = feedparser.parse(url)
for post in feed.entries:
    # no self posts; this is because I wanted to be able to post chatty things
    if 'link' not in post or (('id' in post) and (post['link'] == post['id'])):
        continue
    # no recursive posts
    post_url = post['link']
    if exclude_this_site in post_url:
        continue
    filename = get_filename_from_post(post)
    prospective_path = os.path.join(filedir, filename)
    if not os.path.exists(prospective_path):
        print(f"Detected new post: {post['title']}")
        file_content = get_file_string(post)
        save_post(prospective_path, file_content)
        save_post(os.path.join(targetdir, filename), file_content)
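To make the output concrete: for a hypothetical bookmark titled “Some neat link” posted to a community called foo (every value below is made up), the script writes a file named something like 2021-05-04-some-neat-link.md containing:

---
title: "Some neat link"
layout: lemmy
community: foo
category: responses
tags: from_lemmy bookmark
replyto: "https://example.com/neat-link"
date: "2021-05-04T10:23:45-07:00"
lemmylink: "https://dev.lemmy.ml/post/12345"
---
<p>whatever chatter I attached to the link</p>

Jekyll then builds that into the site like any other post in the responses category.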
[1] I think this is pretty exciting for a lot of reasons. Reddit’s whole shtick makes it easy for uninvested lurkers to participate a bit and help sort content–but the whole cultural vibe Reddit has going on is very much Not My Scene, even when I try to be discerning and participate only in small subreddits.