# -*- coding: utf-8 -*-
r"""
Parser for python source files
==============================
"""
# Created Sun Nov 27 14:03:07 2016
# Author: Óscar Nájera
from __future__ import division, absolute_import, print_function
import codecs
import ast
from io import BytesIO
import re
import tokenize
from textwrap import dedent
from sphinx.errors import ExtensionError
from sphinx.util.logging import getLogger
# Module-level logger registered under the 'sphinx-gallery' name.
logger = getLogger('sphinx-gallery')
# Placeholder docstring returned by ``_get_docstring_and_rest`` in place of
# the real one when the example script cannot be parsed (invalid syntax).
SYNTAX_ERROR_DOCSTRING = """
SyntaxError
===========
Example script with invalid Python syntax
"""
# The pattern for in-file config comments is designed to not greedily match
# newlines at the start and end, except for one newline at the end. This
# ensures that the matched pattern can be removed from the code without
# changing the block structure; i.e. empty newlines are preserved, e.g. in
#
# a = 1
#
# # sphinx_gallery_thumbnail_number = 2
#
# b = 2
# Common prefix of every in-file flag: optional indentation, a ``#`` and any
# whitespace that follows it. Anchored at a line start (needs re.MULTILINE).
FLAG_START = r"^[\ \t]*#\s*"
# ``# sphinx_gallery_<name>`` optionally followed by ``= <value>``; at most
# one trailing newline is consumed.
INFILE_CONFIG_PATTERN = re.compile(
    FLAG_START + r"sphinx_gallery_([A-Za-z0-9_]+)(\s*=\s*(.+))?[\ \t]*\n?",
    flags=re.MULTILINE)
# Comment flags delimiting a region that is stripped from the code.
START_IGNORE_FLAG = FLAG_START + "sphinx_gallery_start_ignore"
END_IGNORE_FLAG = FLAG_START + "sphinx_gallery_end_ignore"
# Shortest span from a start flag to the next end flag (both inclusive),
# plus at most one trailing newline.
IGNORE_BLOCK_PATTERN = re.compile(
    START_IGNORE_FLAG + r"(?:[\s\S]*?)" + END_IGNORE_FLAG + r"\n?",
    flags=re.MULTILINE)
def parse_source_file(filename):
    """Parse source file into AST node.

    Parameters
    ----------
    filename : str
        File path

    Returns
    -------
    node : AST node or None
        The parsed module, or ``None`` when the file content is not valid
        Python syntax.
    content : str
        Text of the file decoded from UTF-8, with Windows line endings
        (``\\r\\n``) converted to ``\\n``.
    """
    # newline='' disables newline translation on read, so the only
    # normalisation applied is the explicit '\r\n' -> '\n' replacement below.
    with open(filename, 'r', encoding='utf-8', newline='') as fid:
        content = fid.read()
    # change from Windows format to UNIX for uniformity
    content = content.replace('\r\n', '\n')

    try:
        node = ast.parse(content)
    except SyntaxError:
        # Callers still need the raw text to report the broken example.
        return None, content
    return node, content
def _get_docstring_and_rest(filename):
    """Separate ``filename`` content between docstring and the rest.

    Strongly inspired from ast.get_docstring.

    Parameters
    ----------
    filename : str
        Path of the Python source file to split.

    Returns
    -------
    docstring : str
        docstring of ``filename``. When the file has a syntax error this is
        ``SYNTAX_ERROR_DOCSTRING`` instead.
    rest : str
        ``filename`` content without the docstring (the whole content when
        the file has a syntax error).
    lineno : int
        1-based line number of the first line of ``rest``.
    node : ast Node
        The parsed module, or ``None`` when the file has a syntax error.

    Raises
    ------
    ExtensionError
        If the parsed node is not a module, or if the module does not start
        with a docstring.
    """
    node, content = parse_source_file(filename)

    if node is None:
        return SYNTAX_ERROR_DOCSTRING, content, 1, node

    if not isinstance(node, ast.Module):
        raise ExtensionError("This function only supports modules. "
                             "You provided {0}"
                             .format(node.__class__.__name__))

    # ast.get_docstring returns None unless the first statement is a string
    # expression. Relying on it avoids ``ast.Str`` / ``.s``, which no longer
    # exist on Python 3.12+.
    raw_docstring = ast.get_docstring(node, clean=False)
    if raw_docstring is None:
        raise ExtensionError(
            'Could not find docstring in file "{0}". '
            'A docstring is required by sphinx-gallery '
            'unless the file is ignored by "ignore_pattern"'
            .format(filename))

    docstring = ast.get_docstring(node)
    # Strict backward compatibility: keep the leading newline that the
    # cleaned docstring loses.
    if raw_docstring.startswith('\n'):
        docstring = '\n' + docstring

    # Find the first string according to the tokenizer and get its end row;
    # stays 0 when the tokenizer reports no string at all.
    lineno = 0
    for tk in tokenize.tokenize(BytesIO(content.encode()).readline):
        if tk.type == tokenize.STRING:
            lineno = tk.end[0]
            break

    # Content of the file after the last line of the docstring.
    rest = '\n'.join(content.split('\n')[lineno:])
    lineno += 1
    return docstring, rest, lineno, node
def split_code_and_text_blocks(source_file, return_node=False):
    """Split a source file into an ordered list of text and code blocks.

    The module docstring always becomes the first ``'text'`` block. After
    it, a comment line made of at least 20 ``#`` characters, or one starting
    with ``# %%``, opens a text block formed by the comment lines that
    directly follow it; everything else is code.

    Parameters
    ----------
    source_file : str
        Path to the source file.
    return_node : bool
        If True, return the ast node.

    Returns
    -------
    file_conf : dict
        File-specific settings given in source file comments as:
        ``# sphinx_gallery_<name> = <value>``
    blocks : list
        (label, content, line_number)
        List where each element is a tuple with the label ('text' or 'code'),
        the corresponding content string of block and the leading line number
    node : ast Node
        The parsed node. Only returned when ``return_node`` is True.
    """
    docstring, body, line_no, node = _get_docstring_and_rest(source_file)
    blocks = [('text', docstring, 1)]
    file_conf = extract_file_config(body)

    block_header = re.compile(
        r'(?P<header_line>^#{20,}.*|^# ?%%.*)\s(?P<text_content>(?:^#.*\s?)*)',
        flags=re.M)
    leading_hash = re.compile('^#', flags=re.M)

    cursor = 0
    for found in block_header.finditer(body):
        # Everything between the previous match and this header is code.
        code = body[cursor:found.start()]
        if code.strip():
            blocks.append(('code', code, line_no))
        # Skip the code lines plus the (ignored) header line itself.
        line_no += code.count('\n') + 1

        commented = found.group('text_content')
        # Drop the leading '#' of each line, then the common indentation.
        text = dedent(leading_hash.sub('', commented)).lstrip()
        if text.strip():
            blocks.append(('text', text, line_no))
        line_no += commented.count('\n')
        cursor = found.end()

    # Whatever follows the last text block is a trailing code block.
    tail = body[cursor:]
    if tail.strip():
        blocks.append(('code', tail, line_no))

    if return_node:
        return file_conf, blocks, node
    return file_conf, blocks
def remove_ignore_blocks(code_block):
    """Return the content of *code_block* with ignored areas removed.

    An ignore block starts with ``# sphinx_gallery_start_ignore``, and ends
    with ``# sphinx_gallery_end_ignore``. These lines and anything in between
    them will be removed, but surrounding empty lines are preserved.

    Parameters
    ----------
    code_block : str
        A code segment.

    Returns
    -------
    str
        *code_block* without its ignore blocks.

    Raises
    ------
    ExtensionError
        If the number of start flags differs from the number of end flags.
    """
    # Both flag patterns are anchored with ``^``. Without re.MULTILINE that
    # anchor only matches at the very start of the string, so flags on any
    # later line would never be counted.
    num_start_flags = len(
        re.findall(START_IGNORE_FLAG, code_block, flags=re.MULTILINE))
    num_end_flags = len(
        re.findall(END_IGNORE_FLAG, code_block, flags=re.MULTILINE))

    if num_start_flags != num_end_flags:
        raise ExtensionError(
            'All "sphinx_gallery_start_ignore" flags must have a matching '
            '"sphinx_gallery_end_ignore" flag!'
        )
    return IGNORE_BLOCK_PATTERN.sub('', code_block)