Python源码示例:pypandoc.convert_text()
示例1
def _convert_md_table_to_rst(table):
"""Convert a markdown table to rst format"""
if len(table) < 3:
return ''
out = '```eval_rst\n.. list-table::\n :header-rows: 1\n\n'
for i,l in enumerate(table):
cols = l.split('|')[1:-1]
if i == 0:
ncol = len(cols)
else:
if len(cols) != ncol:
return ''
if i == 1:
for c in cols:
if len(c) is not 0 and '---' not in c:
return ''
else:
for j,c in enumerate(cols):
out += ' * - ' if j == 0 else ' - '
out += pypandoc.convert_text(
c, 'rst', format='md').replace('\n', ' ').replace('\r', '') + '\n'
out += '```\n'
return out
示例2
def create(self, variables, md_output, pdf_output):
    """Render ``self.template`` to markdown and try to export it as a PDF.

    :param variables: context passed to the template's ``render``
    :param md_output: path the rendered markdown is written to, or ``None``
        to skip writing the markdown file
    :param pdf_output: path handed to pandoc as the PDF output file

    PDF generation is best-effort: any exception raised while importing or
    calling pypandoc is logged as a warning and swallowed.
    """
    env = Environment(loader=PackageLoader('qanta', 'reporting/templates'))
    template = env.get_template(self.template)
    markdown = template.render(variables)
    if md_output is not None:
        with open(md_output, 'w') as f:
            f.write(markdown)
    try:
        # Imported lazily so a missing pypandoc only disables the PDF step.
        import pypandoc
        pypandoc.convert_text(
            markdown,
            'pdf',
            format='md',
            outputfile=pdf_output,
            extra_args=['-V', 'geometry:margin=.75in']
        )
    except Exception as e:
        # ``warn`` is a deprecated alias of ``warning`` on stdlib loggers.
        # NOTE(review): assumes ``log`` is a standard ``logging.Logger``.
        log.warning('Pandoc was not installed or there was an error calling it, omitting PDF report')
        log.warning(str(e))
示例3
def convert_issue_data(self, redmine_issue):
    """
    Build the title/body/assignees payload of a new GitHub issue from a
    Redmine issue mapping.

    The description is converted from textile to GitHub markdown and is
    prefixed with a note naming the Redmine id, the creation date and, for
    closed issues, the closing date.
    """
    markdown_description = convert_text(
        redmine_issue['description'], 'markdown_github', 'textile'
    )
    # Only the date part (before the 'T') of the timestamps is kept.
    note = '###### ported from Redmine #%s (created %s)' % (
        redmine_issue['id'],
        redmine_issue['created_on'].split('T')[0],
    )
    if self.is_closed(redmine_issue):
        closed_day = redmine_issue['closed_on'].split('T')[0]
        note = '%s (CLOSED %s)' % (note, closed_day)
    body = "%s\n\n%s" % (note, markdown_description)
    return {
        "title": "%(subject)s (RM#%(id)s)" % redmine_issue,
        "body": body,
        "assignees": ["adam-iris"],
    }
示例4
def fill_notebook(work_notebook, script_blocks, gallery_conf):
    """Append one notebook cell per script block.

    Blocks labelled ``'code'`` become code cells. Every other block is
    converted from rst to markdown -- with ``rst2md`` when
    ``gallery_conf["pypandoc"]`` is ``False``, otherwise with pypandoc --
    and added as a markdown cell.

    Parameters
    ----------
    work_notebook
        Notebook object handed to ``add_code_cell`` / ``add_markdown_cell``.
    script_blocks : list
        Each list element should be a tuple of (label, content, lineno).
    gallery_conf : dict
        Configuration; when ``gallery_conf["pypandoc"]`` is not ``False`` it
        is unpacked as keyword arguments for ``pypandoc.convert_text``.
    """
    for label, content, _lineno in script_blocks:
        if label == 'code':
            add_code_cell(work_notebook, content)
            continue
        pandoc_options = gallery_conf["pypandoc"]
        if pandoc_options is False:
            text = rst2md(content + '\n')
        else:
            import pypandoc
            # pandoc automatically adds \n to the end
            text = pypandoc.convert_text(
                content, to='md', format='rst', **pandoc_options
            )
        add_markdown_cell(work_notebook, text)
示例5
def _convert_md_table_to_rst(table):
"""Convert a markdown table to rst format"""
if len(table) < 3:
return ''
out = '```eval_rst\n.. list-table::\n :header-rows: 1\n\n'
for i,l in enumerate(table):
cols = l.split('|')[1:-1]
if i == 0:
ncol = len(cols)
else:
if len(cols) != ncol:
return ''
if i == 1:
for c in cols:
if len(c) is not 0 and '---' not in c:
return ''
else:
for j,c in enumerate(cols):
out += ' * - ' if j == 0 else ' - '
out += pypandoc.convert_text(
c, 'rst', format='md').replace('\n', ' ').replace('\r', '') + '\n'
out += '```\n'
return out
示例6
def rst_to_notebook(infile, outfile, diridx=False):
    """Convert an rst file to a notebook file.

    The rst text is converted to markdown with pandoc, its links are
    rewritten to point at notebooks, and the result is wrapped in triple
    quotes and passed through ``py2jn`` to write ``outfile``.

    :param infile: path of the rst file to read
    :param outfile: path passed to ``py2jn.tools.write_notebook``
    :param diridx: when true, link targets that contain no ``/`` get
        ``/index.ipynb`` appended
    """
    # Read infile into a string
    with open(infile, 'r') as fin:
        rststr = fin.read()
    # Convert string from rst to markdown
    mdfmt = 'markdown_github+tex_math_dollars+fenced_code_attributes'
    mdstr = pypandoc.convert_text(rststr, mdfmt, format='rst',
                                  extra_args=['--atx-headers'])
    # In links, replace .py extensions with .ipynb. The dot is escaped so
    # that only a literal ".py" suffix matches; unescaped, a target such as
    # "(happy)" was rewritten to "(ha.ipynb)".
    mdstr = re.sub(r'\(([^\)]+)\.py\)', r'(\1.ipynb)', mdstr)
    # Links to subdirectories require explicit index file inclusion
    if diridx:
        mdstr = re.sub(r']\(([^\)/]+)\)', r'](\1/index.ipynb)', mdstr)
    # Enclose the markdown within triple quotes and convert from
    # python to notebook
    mdstr = '"""' + mdstr + '"""'
    nb = py2jn.py_string_to_notebook(mdstr)
    py2jn.tools.write_notebook(nb, outfile, nbver=4)
示例7
def pandoc_process(app, what, name, obj, options, lines):
    """Convert a docstring from Markdown into reStructuredText using pandoc.

    ``lines`` is rewritten in place with the converted text; nothing is done
    when it is empty. The input format is read from
    ``app.config.mkdsupport_use_parser``.
    """
    if not lines:
        return None
    source_format = app.config.mkdsupport_use_parser
    # Per the original author's note: sphinx.ext.autodoc hands over unicode
    # and pypandoc.convert_text accepts unicode (or utf-8 encoded) input and
    # always returns unicode, so there is no need to deal with encodings.
    converted = pypandoc.convert_text(SEP.join(lines), 'rst', format=source_format)
    # Sphinx expects the very same list object to be modified.
    lines[:] = converted.split(SEP)
示例8
def _convert_md_table_to_rst(table):
"""Convert a markdown table to rst format"""
if len(table) < 3:
return ''
out = '```eval_rst\n.. list-table::\n :header-rows: 1\n\n'
for i,l in enumerate(table):
cols = l.split('|')[1:-1]
if i == 0:
ncol = len(cols)
else:
if len(cols) != ncol:
return ''
if i == 1:
for c in cols:
if len(c) is not 0 and '---' not in c:
return ''
else:
for j,c in enumerate(cols):
out += ' * - ' if j == 0 else ' - '
out += pypandoc.convert_text(
c, 'rst', format='md').replace('\n', ' ').replace('\r', '') + '\n'
out += '```\n'
return out
示例9
def convert(content, from_format, to_format, use_file=False):
    """Convert ``content`` between pandoc formats and return the result as text.

    With ``use_file`` set, pandoc writes to a file obtained from
    ``make_file(to_format)``; that file is read back with ``read_file`` and
    decoded as UTF-8, falling back to latin-1. Otherwise the value returned
    by ``pypandoc.convert_text`` is returned directly.
    """
    target = make_file(to_format) if use_file else None
    converted = pypandoc.convert_text(
        content, to_format, format=from_format, outputfile=target)
    if not use_file:
        return converted
    raw = read_file(target)
    try:
        return raw.decode('UTF-8')
    except UnicodeDecodeError:
        return raw.decode('latin-1')
示例10
def __init__(self, source_data):
    """Convert mediawiki ``source_data`` to HTML and pass it to the parent initializer.

    Raises ``PypandocImportError`` when pypandoc cannot be imported.
    """
    try:
        import pypandoc
    except ImportError as e:
        # pypandoc is an optional dependency, so it may not be installed
        # on this system.
        raise PypandocImportError(e)

    html = pypandoc.convert_text(source_data, "html", format="mediawiki")
    super().__init__(html)
示例11
def convert_rst_to_md(text):
    """Convert rst ``text`` to markdown via pandoc, passing ``--wrap=preserve``."""
    pandoc_args = ["--wrap=preserve"]
    return pypandoc.convert_text(text, "md", format="rst", extra_args=pandoc_args)
示例12
def html2markdown(html: str) -> str:
    """
    Returns the given HTML as equivalent Markdown-structured text.

    Conversion is attempted with pandoc first. If that raises ``OSError``,
    a hint is printed and ``html2text`` is used instead.
    """
    try:
        return pypandoc.convert_text(html, 'md', format='html')
    except OSError:
        # Adjacent literals are joined verbatim, so each piece carries its
        # own trailing space; the original printed "better resultsthan".
        msg = "It's recommended to install the `pandoc` library for converting " \
              "HTML into Markdown-structured text. It tends to have better results " \
              "than `html2text`, which is now used as a fallback."
        print(msg)
        return html2text(html)
示例13
def md2rst(comment):
    """Convert a comment from protobuf markdown to restructuredtext.

    This method:
    - Replaces proto links with literals (e.g. [Foo][bar.baz.Foo] -> `Foo`)
    - Resolves relative URLs to https://cloud.google.com
    - Runs pandoc to convert from markdown to restructuredtext
    """
    comment = _replace_relative_link(_replace_proto_link(comment))

    # pypandoc.convert_text is slow, so it is skipped entirely when the
    # comment contains none of the markdown special characters.
    needs_pandoc = any(marker in comment for marker in '`[]*_')
    if not needs_pandoc:
        # Unprocessed comments already have their leading space; leave as is.
        return comment

    converted = pypandoc.convert_text(comment, 'rst', format='commonmark')
    # The converted text is valid restructuredtext, but it is inserted back
    # into a descriptor set where every comment line is expected to begin
    # with a space separating it from the leading '//'. Parsing code in
    # gapic-generator strips that space, which would break the indentation
    # of lines that really do start with one, so the extra space is added
    # here.
    return _insert_spaces(converted)
示例14
def read(self, contents, context=None):
    """Parse the markdown string ``contents`` into an AST via pandoc's JSON output.

    ``contents`` must be a ``str``; ``context`` is accepted but not used here.
    """
    assert isinstance(contents, str)
    pandoc_json = pypandoc.convert_text(contents, 'json', format=PANDOC_MARKDOWN_FORMAT)
    return ASTPlugin().loads(pandoc_json)
示例15
def get_pandoc_api_version():
    """Return the ``pandoc-api-version`` entry of pandoc's JSON output for an empty markdown document."""
    import pypandoc

    empty_doc = pypandoc.convert_text('', 'json', format='markdown')
    return json.loads(empty_doc)['pandoc-api-version']
示例16
def html2markdown(html):
    """Return ``html`` converted by pandoc to ``markdown_strict`` text."""
    return pypandoc.convert_text(html, 'markdown_strict', format='html')
示例17
def convert_rst_to_md(text):
    """Convert rst ``text`` to markdown via pandoc, passing ``--wrap=preserve``."""
    pandoc_args = ["--wrap=preserve"]
    return pypandoc.convert_text(text, "md", format="rst", extra_args=pandoc_args)
示例18
def convert_rst_to_md(text):
    """Convert rst ``text`` to markdown via pandoc, passing ``--wrap=preserve``."""
    pandoc_args = ["--wrap=preserve"]
    return pypandoc.convert_text(text, "md", format="rst", extra_args=pandoc_args)
示例19
def twlight_wikicode2html(value):
    """Return ``value`` converted by pandoc from mediawiki markup to html."""
    return pypandoc.convert_text(value, "html", format="mediawiki")
示例20
def twlight_wikicode2html(value):
    """Return ``value`` converted by pandoc from mediawiki markup to html."""
    return pypandoc.convert_text(value, "html", format="mediawiki")
示例21
def pandoc_convert(text, to="html5", args=None, outputfile=None):
    """Convert ``text`` with pandoc, always adding ``--quiet``.

    :param text: source text to convert
    :param to: pandoc output format
    :param args: optional iterable of extra pandoc arguments; it is not
        modified
    :param outputfile: forwarded to ``pypandoc.convert_text``
    :return: whatever ``pypandoc.convert_text`` returns

    The input format is the ``input-format`` setting, or ``"markdown"`` when
    that setting is empty.
    """
    fr = Settings.new().get_value('input-format').get_string() or "markdown"
    # Build a fresh list on every call. The previous ``args=[]`` default was
    # extended in place, so ``--quiet`` piled up across calls and a list
    # passed by the caller was mutated.
    extra_args = list(args or ()) + ["--quiet"]
    return pypandoc.convert_text(text, to, fr, extra_args=extra_args, outputfile=outputfile)
示例22
def convert(source: str, to: str, extra_args=(),
            output_file: str=None) -> None:
    """
    Stitch a source document and render it with pandoc.

    Parameters
    ----------
    source : str
    to : str
    extra_args : iterable
    output_file : str

    Notes
    -----
    Either writes to ``output_file`` or prints to stdout.
    """
    if output_file is None:
        output_name = 'std_out'
    else:
        output_name = os.path.splitext(os.path.basename(output_file))[0]

    stitcher = Stitch(
        name=output_name,
        to=to,
        standalone='--standalone' in extra_args,
        self_contained='--self-contained' in extra_args,
        use_prompt='--use-prompt' in extra_args,
    )
    # '--use-prompt' is consumed by Stitch above and is not forwarded to pandoc.
    pandoc_args = [arg for arg in extra_args if arg != '--use-prompt']

    document = json.dumps(stitcher.stitch(source))
    rendered = pypandoc.convert_text(document, to, format='json',
                                     extra_args=pandoc_args,
                                     outputfile=output_file)
    if output_file is None:
        print(rendered)
示例23
def tokenize(source: str) -> dict:
    """
    Parse the markdown ``source`` into pandoc's JSON AST.
    """
    ast_json = pypandoc.convert_text(source, 'json', 'markdown')
    return json.loads(ast_json)
示例24
def tokenize_block(source: str, pandoc_extra_args: list=None) -> list:
    """
    Parse the markdown ``source`` with pandoc and return the ``blocks`` list
    of the resulting JSON AST.
    """
    extra = [] if pandoc_extra_args is None else pandoc_extra_args
    ast = json.loads(
        pypandoc.convert_text(source, to='json', format='markdown', extra_args=extra)
    )
    return ast['blocks']
示例25
def as_json(document):
    "Pandoc JSON representation of the markdown document, parsed into Python objects"
    raw = pypandoc.convert_text(document, 'json', format='markdown')
    return json.loads(raw)
示例26
def handle_law_from_xml(self, book, book_xml) -> LawBook:
    """Create and save a ``Law`` for every ``sect2`` element in ``book_xml``.

    Each ``sect1`` element is registered on ``book`` via ``add_section``.
    For each of its ``sect2`` children the ``title`` element is detached,
    the remaining children are converted from DocBook to HTML with pandoc,
    and a ``Law`` is saved. Laws are numbered consecutively from 1 and each
    one references the previously saved law.

    :param book: object providing ``add_section``; passed to ``Law`` as ``book``
    :param book_xml: XML document accepted by ``etree.fromstring``
    :return: the ``book`` that was passed in
    """
    previous_law = None
    law_order = 1

    # Parse XML tree
    tree = etree.fromstring(book_xml)

    for sect in tree.xpath('sect1'):
        section_title = sect.xpath('title/text()')[0]
        logger.debug('Section: %s' % section_title)

        book.add_section(from_order=law_order, title=section_title.strip())

        # The enumerate() index was never used, so iterate directly.
        for law_raw in sect.xpath('sect2'):
            # Detach the title so it is not part of the converted content.
            law_title = law_raw.xpath('title')[0]
            law_title.getparent().remove(law_title)

            law_docbook = '\n'.join(tostring(x).decode('utf-8') for x in law_raw.iterchildren())
            law_text = pypandoc.convert_text(law_docbook, 'html', format='docbook')
            law_section = tostring(law_title, method="text").decode('utf-8').strip()

            law = Law(book=book,
                      # The literal here was the unterminated ``"'``.
                      # NOTE(review): assumes an empty title was intended.
                      title='',
                      section=law_section,
                      slug=slugify(law_section),
                      content=law_text,
                      previous=previous_law,
                      order=law_order
                      )
            law.save()

            law_order += 1
            previous_law = law

    return book
示例27
def render_to_format(request, format, title, template_src, context):
    """Render a template and return it as an HTTP response in ``format``.

    :param request: not used by this function
    :param format: export format; must be a key of ``settings.EXPORT_FORMATS``
    :param title: base name used in the ``Content-Disposition`` filename
    :param template_src: template name passed to ``get_template``
    :param context: context the template is rendered with
    :return: ``HttpResponse`` with the rendered html for ``'html'``, an
        ``HttpResponse`` with the pandoc-converted file content for other
        supported formats, or ``HttpResponseBadRequest`` for unsupported ones
    """
    if format not in dict(settings.EXPORT_FORMATS):
        return HttpResponseBadRequest(_('This format is not supported.'))

    # render the template to a html string
    template = get_template(template_src)
    html = template.render(context)

    # remove empty lines
    html = os.linesep.join([line for line in html.splitlines() if line.strip()])

    if format == 'html':
        return HttpResponse(html)

    if format == 'pdf':
        # the name of the pdf engine argument changed with pandoc version 2
        if pypandoc.get_pandoc_version().split('.')[0] == '1':
            args = ['-V', 'geometry:margin=1in', '--latex-engine=xelatex']
        else:
            args = ['-V', 'geometry:margin=1in', '--pdf-engine=xelatex']

        content_disposition = 'filename="%s.%s"' % (title, format)
    else:
        args = []
        content_disposition = 'attachment; filename="%s.%s"' % (title, format)

    # use reference document for certain file formats
    refdoc = set_export_reference_document(format)
    if refdoc is not None and format in ('docx', 'odt'):
        # Compare the major version component exactly as the pdf branch
        # does; ``startswith("1")`` would also match a "10.x" version.
        if pypandoc.get_pandoc_version().split('.')[0] == '1':
            args.append('--reference-' + format + '=' + refdoc)
        else:
            args.append('--reference-doc=' + refdoc)

    # create a temporary file
    tmp_fd, tmp_filename = mkstemp('.' + format)
    try:
        # The descriptor is wrapped before pandoc runs so that it is closed,
        # and the file removed, even when the conversion raises.
        with os.fdopen(tmp_fd, 'rb') as file_handler:
            log.info("Export " + format + " document using args " + str(args))
            # convert the file using pandoc
            pypandoc.convert_text(html, format, format='html', outputfile=tmp_filename, extra_args=args)
            # read the temporary file
            file_content = file_handler.read()
    finally:
        # delete the temporary file
        os.remove(tmp_filename)

    # create the response object
    response = HttpResponse(file_content, content_type='application/%s' % format)
    response['Content-Disposition'] = content_disposition.encode('utf-8')
    return response
示例28
def gen_release_notes(path):
    """
    Generate the reStructuredText files for "Release Notes".

    The ``whatsnew`` directory under ``path`` is recreated from scratch. It
    receives an ``index.rst`` containing a toctree plus one ``<tag>.rst``
    file per release returned by ``list_releases_to_document``. Each release
    body is converted from markdown to rst, and its pull request references
    are turned into links.
    """
    whatsnew_dir = "%s/whatsnew" % path
    shutil.rmtree(whatsnew_dir, ignore_errors=True)
    os.mkdir(whatsnew_dir)

    with open("%s/index.rst" % whatsnew_dir, "a") as index_file:
        title = "Release Notes"
        rule = "=" * len(title)
        index_file.writelines([
            rule, "\n",
            title, "\n",
            rule, "\n",
            "\n",
            ".. toctree::",
            " :maxdepth: 1",
            "\n",
            "\n",
        ])

        for name, tag_name, body in list_releases_to_document(ks.__version__):
            release_doc = pypandoc.convert_text(body, "rst", format="md")

            # Make PR reference link pretty.
            # ", #..." becomes ", `#... <https://github.com/databricks/koalas/pull/...>`_"
            release_doc = re.sub(
                r', #(\d+)',
                r', `#\1 <https://github.com/databricks/koalas/pull/\1>`_', release_doc)
            # "(#..." becomes "(`#... <https://github.com/databricks/koalas/pull/...>`_"
            release_doc = re.sub(
                r'\(#(\d+)',
                r'(`#\1 <https://github.com/databricks/koalas/pull/\1>`_', release_doc)

            # toctree entry for this release
            index_file.writelines([" " + tag_name, "\n", "\n"])

            with open("%s/%s.rst" % (whatsnew_dir, tag_name), "a") as release_file:
                underline = "=" * len(name)
                release_file.writelines([
                    underline, "\n",
                    name, "\n",
                    underline, "\n",
                    "\n",
                    release_doc,
                    "\n",
                    "\n",
                ])
示例29
def convert(self, text):
    """Convert Redmine-flavoured textile ``text`` to markdown.

    Code blocks and collapse sections are rewritten to html before pandoc
    runs. Afterwards wiki links, nested lists, blockquotes, note macros,
    attachments and code highlights are patched with the ``self.regex*``
    patterns.

    :param text: textile source
    :return: the converted markdown, or ``False`` when a ``RuntimeError``
        is raised during conversion or post-processing
    """
    text = '\n\n'.join([re.sub(self.regexCodeBlock, r'<pre>\1</pre>', block) for block in text.split('\n\n')])

    for collapse in re.findall(self.regexCollapse, text):
        text = text.replace(collapse[0], "<details>")
        text = text.replace(collapse[2], "<summary>{}</summary> \n\n{}".format(collapse[1], collapse[2]))
        text = text.replace(collapse[3], "</details>")
    text = re.sub(self.regexParagraph, "", text)

    # convert from textile to markdown
    try:
        text = pypandoc.convert_text(text, 'markdown_strict', format='textile')

        # The substitutions below used to pass ``re.MULTILINE | re.DOTALL``
        # as the fourth positional argument of ``re.sub``. That argument is
        # ``count``, so no flags were applied and at most 24 matches were
        # replaced. The stray argument is dropped to replace all matches.
        # NOTE(review): if these patterns need MULTILINE/DOTALL, set the
        # flags where the patterns are defined.

        # pandoc does not convert everything, notably the [[link|text]] syntax
        # is not handled. So let's fix that.

        # [[ wikipage | link_text ]] -> [link_text](wikipage)
        text = re.sub(self.regexWikiLinkWithText, self.wiki_link, text)

        # [[ link_url ]] -> [link_url](link_url)
        text = re.sub(self.regexWikiLinkWithoutText, self.wiki_link, text)

        # nested lists, fix at least the common issues
        text = text.replace(" \\#\\*", " -")
        text = text.replace(" \\*\\#", " 1.")

        # Redmine is using '>' for blockquote, which is not textile
        text = text.replace("> ", ">")

        # wiki note macros
        text = re.sub(self.regexTipMacro, r'---\n**TIP**: \1\n---\n', text)
        text = re.sub(self.regexNoteMacro, r'---\n**NOTE**: \1\n---\n', text)
        text = re.sub(self.regexWarningMacro, r'---\n**WARNING**: \1\n---\n', text)
        text = re.sub(self.regexImportantMacro, r'---\n**IMPORTANT**: \1\n---\n', text)

        # all other macros
        text = re.sub(self.regexAnyMacro, r'\1', text)

        # attachments in notes
        text = re.sub(self.regexAttachment, r"\n\n*(Merged from Redmine, please check first note for attachment named **\1**)*", text)

        # code highlight
        for highlight in re.findall(self.regexCodeHighlight, text):
            text = text.replace(highlight[0], "\n```{}".format(highlight[2].lower()))
            text = text.replace(highlight[3], "\n```")
    except RuntimeError:
        return False

    return text
示例30
def convert_ipynb_to_gallery(nb, new_file):
    """Convert a Jupyter notebook into a Python script with ``# %%`` cell markers.

    The first cell becomes the module docstring: the rst conversion of its
    first source line when it is a markdown cell, otherwise a name derived
    from ``file_name``, followed by the cell's code. Later markdown cells
    become ``# ``-prefixed rst comments and later code cells are copied, each
    introduced by a ``# %%`` marker. Lines starting with ``%`` are commented
    out.

    :param nb: path of the notebook (JSON) file to read
    :param new_file: path the generated script is written to
    """
    # Context managers close both files; the original left the handles
    # returned by open() unclosed.
    with open(nb, encoding="utf8", errors='ignore') as nb_handle:
        nb_dict = json.load(nb_handle)

    python_file = ""
    for i, cell in enumerate(nb_dict['cells']):
        if i == 0:
            if cell['cell_type'] != 'markdown':
                # NOTE(review): ``file_name`` is not defined in this function;
                # presumably a module-level global. Confirm it exists, or
                # derive the name from ``nb`` instead.
                rst_source = os.path.basename(file_name[:-5])
                rst_source = bytes(rst_source, 'utf-8').decode('utf-8', 'ignore')
                python_file = '"""\n' + rst_source + '\n"""'
                source = ''.join(cell['source'])
                python_file = python_file + '\n' * 2 + source
            else:
                b = cell['source']
                print(b)
                # Only the first source line of the cell is used as the title.
                a = bytes(cell['source'][0], 'utf-8').decode('utf-8', 'ignore')
                print(a)
                md_source = ''.join(a)
                rst_source = pdoc.convert_text(md_source, 'rst', 'md')
                print(rst_source)
                rst_source = bytes(rst_source, 'utf-8').decode('utf-8', 'ignore')
                python_file = '"""\n' + rst_source + '\n"""'
        else:
            if cell['cell_type'] == 'markdown':
                md_source = ''.join(cell['source'])
                rst_source = pdoc.convert_text(md_source, 'rst', 'md')
                rst_source = rst_source.encode().decode('utf-8', 'ignore')
                commented_source = '\n'.join(['# ' + x for x in
                                              rst_source.split('\n')])
                python_file = python_file + '\n\n\n' + '# %%' + '\n' + \
                    commented_source
            elif cell['cell_type'] == 'code':
                source = ''.join(cell['source'])
                python_file = python_file + '\n' * 2 + '# %% \n' + source

    # Comment out lines starting with '%' (e.g. IPython magics).
    python_file = python_file.replace("\n%", "\n# %")
    with open(new_file, 'w', newline='', errors='ignore') as out_handle:
        out_handle.write(python_file)
#%%