Source code for evileg_core.utils

# -*- coding: utf-8 -*-

import re

import markdown
import six
from bs4 import BeautifulSoup
from django.conf import settings
from django.contrib.auth.models import Group, Permission
from django.utils.functional import lazy
from django.utils.http import is_safe_url, urlunquote
from django.utils.safestring import mark_safe

from .shortcuts import get_object_or_none

mark_safe_lazy = lazy(mark_safe, six.text_type)
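
# Usage sketch (illustrative, not part of the original module): mark_safe_lazy
# combines lazy evaluation with mark_safe, e.g. for translated strings that
# contain HTML. The help text below is an assumed example.
#
#   from django.utils.translation import ugettext_lazy as _
#   help_text = mark_safe_lazy(_('Supports <b>Markdown</b> syntax'))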


class EImageUrlsGetter:
    """Collects src URLs of images uploaded to this site (paths under /media/) from an HTML fragment"""
    __slots__ = ['soup']

    def __init__(self, text):
        self.soup = BeautifulSoup(text, "lxml") if text else None

    def handle(self):
        if self.soup:
            src_list = set()
            for tag in self.soup.find_all('img'):
                src_attr = tag.get('src')
                # Collect only images served from this site's media storage
                if src_attr and src_attr.startswith('/media/'):
                    src_list.add(src_attr)
            return src_list
        return set()

    @classmethod
    def get_urls(cls, text):
        getter = cls(text=text)
        return getter.handle()
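
# Usage sketch (the HTML below is an assumed example): only images served
# from this site's /media/ storage are collected, external ones are skipped.
#
#   html = '<p><img src="/media/uploads/a.png"><img src="https://cdn.example.com/b.png"></p>'
#   EImageUrlsGetter.get_urls(html)  # -> {'/media/uploads/a.png'}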

class ESoup:
    """
    Clean-up class that extracts unwanted content from user-posted text
    """
    __slots__ = ['soup', 'tags_for_extracting', 'dofollow', 'add_header_anchors']

    def __init__(self, text, tags_for_extracting=(), dofollow=False, add_header_anchors=False):
        self.soup = BeautifulSoup(text, "lxml") if text else None
        # script and style are always extracted, on top of caller-supplied tags
        self.tags_for_extracting = ('script', 'style',) + tags_for_extracting
        self.dofollow = dofollow
        self.add_header_anchors = add_header_anchors

    def _add_header_anchors(self, soup):
        # Prepend an <a class="anchor" id="header_..."> to every header so it can be linked to
        for header_tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            anchor = soup.new_tag('a')
            anchor['class'] = 'anchor'
            anchor['id'] = 'header_{}'.format(header_tag.text.replace(' ', '_'))
            header_tag.insert(0, anchor)
        return soup

    def _extract_tags(self, soup, tags=()):
        # Remove the listed tags together with their contents
        for tag in tags:
            for current_tag in soup.find_all(tag):
                current_tag.extract()
        return soup

    def _remove_attrs(self, soup):
        # Strip all attributes from all tags
        for tag in soup.find_all(True):
            tag.attrs = {}
        return soup

    def _remove_all_attrs_except(self, soup, whitelist_tags=()):
        # Strip all attributes from tags that are not whitelisted
        for tag in soup.find_all(True):
            if tag.name not in whitelist_tags:
                tag.attrs = {}
        return soup

    def _remove_all_attrs_except_saving(self, soup, whitelist_tags=(), whitelist_attrs=(), whitelist_classes=()):
        # Strip attributes, keeping whitelisted attributes on whitelisted tags
        # and whitelisted CSS classes on any tag
        for tag in soup.find_all(True):
            saved_classes = []
            if tag.has_attr('class'):
                classes = tag['class']
                for class_str in whitelist_classes:
                    if class_str in classes:
                        saved_classes.append(class_str)
            if tag.name not in whitelist_tags:
                tag.attrs = {}
            else:
                attrs = dict(tag.attrs)
                for attr in attrs:
                    if attr not in whitelist_attrs:
                        del tag.attrs[attr]
            if len(saved_classes) > 0:
                tag['class'] = ' '.join(saved_classes)
        return soup

    def _add_rel_attr(self, soup, tag, attr):
        # Mark links pointing outside this site with rel="nofollow"
        site_url = getattr(settings, "SITE_URL", '/')
        for current_tag in soup.find_all(tag):
            attr_content = current_tag.get(attr)
            if attr_content and not attr_content.startswith(site_url) and not attr_content.startswith('/'):
                current_tag['rel'] = ['nofollow']
        return soup

    def _add_class_attr(self, soup, tag, classes=()):
        for current_tag in soup.find_all(tag):
            saved_classes = []
            if current_tag.has_attr('class'):
                # BeautifulSoup returns the class attribute as a list,
                # so extend (not append) to keep saved_classes flat
                saved_classes.extend(current_tag['class'])
            saved_classes.extend(list(classes))
            current_tag['class'] = ' '.join(saved_classes)
        return soup

    def _add_attr(self, soup, tag, attr, value):
        for current_tag in soup.find_all(tag):
            current_tag[attr] = value
        return soup

    def _correct_url(self, soup, tag, attr):
        # Strip language prefixes (SITE_URL/<code> or /<code>) from URLs on multilingual sites
        site_url = getattr(settings, "SITE_URL", None)
        languages = getattr(settings, "LANGUAGES", None)
        if site_url is not None and languages is not None and len(languages) > 1:
            site_url_parser = re.compile('({})'.format(
                '|'.join(['^{}/{}'.format(site_url, code) for code, language in languages])))
            relational_url_parser = re.compile('({})'.format(
                '|'.join(['^/{}'.format(code) for code, language in languages])))
            for current_tag in soup.find_all(tag):
                attr_content = current_tag.get(attr)
                if attr_content:
                    attr_content = site_url_parser.sub(site_url, attr_content)
                    attr_content = relational_url_parser.sub('', attr_content)
                    current_tag[attr] = attr_content
        return soup

    def _change_tag_name(self, soup, old_tag, new_tag):
        for current_tag in soup.find_all(old_tag):
            current_tag.name = new_tag
        return soup

    def clean(self):
        if self.soup:
            soup = self._extract_tags(soup=self.soup, tags=self.tags_for_extracting)
            soup = self._remove_all_attrs_except_saving(
                soup=soup,
                whitelist_tags=('img', 'a', 'iframe'),
                whitelist_attrs=('src', 'href', 'name', 'width', 'height', 'alt'),
                whitelist_classes=(
                    'youtube-wrapper', 'youtube-iframe', 'prettyprint',
                    'lang-bsh', 'lang-c', 'lang-cc', 'lang-cpp', 'lang-cs', 'lang-csh',
                    'lang-cyc', 'lang-cv', 'lang-htm', 'lang-html', 'lang-java', 'lang-js',
                    'lang-m', 'lang-mxml', 'lang-perl', 'lang-pl', 'lang-pm', 'lang-py',
                    'lang-rb', 'lang-sh', 'lang-xhtml', 'lang-xml', 'lang-xsl'
                )
            )
            if not self.dofollow:
                soup = self._add_rel_attr(soup=soup, tag='a', attr='href')
                soup = self._add_rel_attr(soup=soup, tag='img', attr='src')
            soup = self._correct_url(soup=soup, tag='a', attr='href')
            soup = self._correct_url(soup=soup, tag='img', attr='src')
            soup = self._add_attr(soup=soup, tag='img', attr='loading', value='lazy')
            soup = self._add_class_attr(soup=soup, tag='img', classes=('img-fluid',))
            soup = self._add_class_attr(soup=soup, tag='table', classes=('table', 'table-bordered', 'table-hover'))
            # A single space-separated token adds both prettyprint and linenums classes
            soup = self._add_class_attr(soup=soup, tag='code', classes=('prettyprint linenums',))
            soup = self._change_tag_name(soup=soup, old_tag='code', new_tag='pre')
            if self.add_header_anchors:
                soup = self._add_header_anchors(soup=soup)
            # Return the cleaned body content without the wrapping <body> tags
            return re.sub('<body>|</body>', '', soup.body.prettify())
        return ''

    @classmethod
    def clean_text(cls, text, tags_for_extracting=(), dofollow=False, add_header_anchors=False):
        soup = cls(text=text, tags_for_extracting=tags_for_extracting,
                   dofollow=dofollow, add_header_anchors=add_header_anchors)
        return soup.clean()
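
# Usage sketch (illustrative input; output described, not reproduced):
# <script> blocks are extracted, non-whitelisted attributes such as onclick
# are dropped, and images get loading="lazy" plus the img-fluid class.
#
#   dirty = '<p onclick="x()">Hi<script>alert(1)</script><img src="/media/a.png"></p>'
#   clean = ESoup.clean_text(dirty)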

def set_adding_header_anchors(model, add_header_anchors=True, field_name='content_markdown'):
    # Enable header anchors on the markdown field and on each of its
    # per-language variants generated for settings.LANGUAGES
    model._meta.get_field(field_name).add_header_anchors = add_header_anchors
    for code, language in getattr(settings, "LANGUAGES", []):
        model._meta.get_field('{}_{}'.format(field_name, code)).add_header_anchors = add_header_anchors
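
# Usage sketch (Article is a hypothetical model with a content_markdown field):
#
#   from myapp.models import Article  # assumed model
#   set_adding_header_anchors(Article)  # also covers content_markdown_<lang> fields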

class EMarkdownWorker:
    """
    Markdown converter. Converts markdown text to HTML and cleans out
    unwanted content using the ESoup class
    """
    __slots__ = ['pre_markdown_text', 'markdown_text']

    def __init__(self, text):
        self.pre_markdown_text = text
        self.markdown_text = None
        self.make_html_from_markdown()

    def make_html_from_markdown(self):
        if self.pre_markdown_text:
            self.markdown_text = markdown.markdown(
                self.pre_markdown_text,
                extensions=['markdown.extensions.attr_list',
                            'markdown.extensions.tables',
                            'markdown.extensions.fenced_code',
                            'markdown.extensions.nl2br',
                            'evileg_core.extensions.video',
                            'superscript',
                            'subscript'],
                output_format='html5'
            )

    def get_text(self, dofollow=False, add_header_anchors=False):
        return ESoup.clean_text(text=self.markdown_text, dofollow=dofollow,
                                add_header_anchors=add_header_anchors)
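
# Usage sketch (illustrative markdown input):
#
#   worker = EMarkdownWorker('# Title\n\nSome **bold** text')
#   html = worker.get_text(add_header_anchors=True)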

def get_next_url(request):
    """
    Get the next URL from the request, for example to return to the previous page.

    :param request: HTTP request
    :return: String
    """
    next_url = request.META.get('HTTP_REFERER')
    if next_url:
        next_url = urlunquote(next_url)  # HTTP_REFERER may be encoded.
    if not is_safe_url(url=next_url, allowed_hosts=request.get_host()):
        next_url = '/'
    return next_url
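
# Usage sketch (hypothetical view): redirect back to the referring page,
# falling back to '/' when the referer is missing or unsafe.
#
#   from django.shortcuts import redirect
#
#   def like_post(request, pk):
#       ...  # toggle the like
#       return redirect(get_next_url(request))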

def get_client_ip(request):
    """
    Get the client IP address from an HTTP request

    :param request: HTTP request
    :return: IP address
    """
    x_forwarded_for = request.META.get('HTTP_X_FORWARDED_FOR')
    return x_forwarded_for.split(',')[-1].strip() if x_forwarded_for else request.META.get('REMOTE_ADDR')
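
# Usage sketch: behind a proxy the last address in X-Forwarded-For is used,
# otherwise REMOTE_ADDR. The header value below is an assumed example.
#
#   # request.META['HTTP_X_FORWARDED_FOR'] == '203.0.113.5, 10.0.0.2'
#   get_client_ip(request)  # -> '10.0.0.2'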

def generate_groups_with_permissions(groups_permissions):
    # Create the groups and attach the model permissions described by the
    # groups_permissions mapping: {group_name: {model_class: [perm_prefixes]}}
    for group_name in groups_permissions:
        group, created = Group.objects.get_or_create(name=group_name)
        for model_cls in groups_permissions[group_name]:
            for perm_name in groups_permissions[group_name][model_cls]:
                codename = perm_name + "_" + model_cls._meta.model_name
                permission = get_object_or_none(klass=Permission, codename=codename)
                if permission:
                    group.permissions.add(permission)
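
# Usage sketch (Article is a hypothetical model): permission prefixes are
# combined with the model name into codenames such as 'change_article'.
#
#   from myapp.models import Article  # assumed model
#   generate_groups_with_permissions({
#       'Moderators': {Article: ['change', 'delete']},
#   })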