Kısa cevap: hayır.
Daha uzun cevap: kolayca değil.
Hâlâ bs3 kullanıyorum, dolayısıyla bu bs3 için bir hack (geçici çözüm). Bunu bs4'e taşımaktan henüz epey uzağım. Çözüm, esas olarak Tag ve BeautifulSoup sınıflarından alt sınıf türetmeyi ve prettify (ve ilgili) yöntemlerini geçersiz kılmayı içerir.
Kod:
import sys
import BeautifulSoup
class Tag(BeautifulSoup.Tag):
    """BeautifulSoup 3 ``Tag`` whose pretty-printer can keep chosen tags inline.

    Tags whose name is listed in ``pprint_exs`` are rendered "flat": no line
    break is emitted between the opening tag, its contents and the closing
    tag.  A line break is still emitted where a run of flat tags changes
    from one tag name to another.  Every other tag is rendered on its own
    lines, as the stock pretty-printer does.
    """

    def _render_attributes(self, encoding):
        """Return this tag's attributes as a list of quoted ``key=value`` strings."""
        rendered = []
        if not self.attrs:
            return rendered
        for key, val in self.attrs:
            fmt = '%s="%s"'
            if isinstance(val, basestring):
                if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                    val = self.substituteEncoding(val, encoding)

                # The attribute value either:
                #
                # * Contains no embedded double quotes or single quotes.
                #   No problem: we enclose it in double quotes.
                # * Contains embedded single quotes. No problem:
                #   double quotes work here too.
                # * Contains embedded double quotes. No problem:
                #   we enclose it in single quotes.
                # * Embeds both single _and_ double quotes. This
                #   can't happen naturally, but it can happen if
                #   you modify an attribute value after parsing
                #   the document. We solve it by enclosing the
                #   attribute in single quotes, and escaping any
                #   embedded single quotes.
                if '"' in val:
                    fmt = "%s='%s'"
                    if "'" in val:
                        # A numeric character reference is used because it
                        # is valid in both XML and HTML.  The previous
                        # "&squot;" is not a defined entity in either.
                        val = val.replace("'", "&#39;")

                # Now we're okay w/r/t quotes. But the attribute
                # value might also contain angle brackets, or
                # ampersands that aren't part of entities. We need
                # to escape those to XML entities too.
                val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

            rendered.append(fmt % (self.toEncoding(key, encoding),
                                   self.toEncoding(val, encoding)))
        return rendered

    def __str__(self, encoding=BeautifulSoup.DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0, pprint_exs=()):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        ``pprint_exs`` is a collection of tag names that must be kept
        inline (not broken onto separate lines) when ``prettyPrint`` is
        true.  It is only tested for membership, never modified.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""
        encodedName = self.toEncoding(self.name, encoding)
        # True when this tag gets the regular one-element-per-line layout.
        unflatten_here = self.name not in pprint_exs
        attrs = self._render_attributes(encoding)

        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        def is_tag(node):
            return isinstance(node, Tag)

        prev = self.findPrevious(is_tag)
        prev_sib = self.findPreviousSibling(is_tag)
        # The previous sibling is an inline tag with a different name.
        ex_break_detected = (prev_sib is not None
                             and prev_sib.name in pprint_exs
                             and self.name != prev_sib.name)
        # The closest preceding tag in the document has a different name.
        break_detected = prev is not None and self.name != prev.name

        indentContents = 0
        space = ''
        if prettyPrint:
            if break_detected or unflatten_here:
                indentContents = indentLevel + 1
            # A non-positive multiplier yields '', so level 0 is safe.
            space = ' ' * (indentLevel - 1)

        contents = self.renderContents(encoding, prettyPrint, indentContents,
                                       pprint_exs, unflatten_here)
        if self.hidden:
            # Hidden tags contribute only their contents.
            return contents

        block = prettyPrint and unflatten_here        # one element per line
        inline = prettyPrint and not unflatten_here   # kept on the same line

        attributeString = ''
        if attrs:
            attributeString = ' ' + ' '.join(attrs)

        pieces = []
        if inline and ex_break_detected:
            pieces.append("\n")
        if prettyPrint and (unflatten_here or break_detected):
            pieces.append(space)
        pieces.append('<%s%s%s>' % (encodedName, attributeString, close))
        if block:
            pieces.append("\n")
        pieces.append(contents)
        if block and contents and contents[-1] != "\n":
            pieces.append("\n")
        if block and closeTag:
            pieces.append(space)
        pieces.append(closeTag)
        if block and closeTag and self.nextSibling:
            pieces.append("\n")
        if (inline and isinstance(self.nextSibling, Tag)
                and self.nextSibling.name != self.name):
            # End the line when the next sibling tag has a different name.
            pieces.append("\n")
        return ''.join(pieces)

    def renderContents(self, encoding=BeautifulSoup.DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0, pprint_exs=(),
                       unflatten=True):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string.

        ``pprint_exs`` is forwarded unchanged to each child tag.  When
        ``unflatten`` is false, text children are emitted without the
        indentation and trailing newline that pretty-printing normally
        adds."""
        wrap_text = prettyPrint and unflatten
        pieces = []
        for child in self:
            if isinstance(child, BeautifulSoup.NavigableString):
                text = child.__str__(encoding)
                if text and prettyPrint:
                    text = text.strip()
                if not text:
                    continue
                if wrap_text:
                    pieces.append(" " * (indentLevel - 1))
                pieces.append(text)
                if wrap_text:
                    pieces.append("\n")
            elif isinstance(child, Tag):
                pieces.append(child.__str__(encoding, prettyPrint,
                                            indentLevel, pprint_exs))
        return ''.join(pieces)


# Replace the library's module-level name so that code looking up
# ``BeautifulSoup.Tag`` afterwards gets this subclass.
BeautifulSoup.Tag = Tag
class BeautifulStoneSoup(Tag, BeautifulSoup.BeautifulStoneSoup):
    """Library ``BeautifulStoneSoup`` with the customised ``Tag`` placed first in its bases."""


# Rebind the library's module-level name to the subclass defined above.
BeautifulSoup.BeautifulStoneSoup = BeautifulStoneSoup
class PumpkinSoup(BeautifulStoneSoup, BeautifulSoup.BeautifulSoup):
    """Soup class that remembers which tag names ``prettify`` must keep inline."""

    def __init__(self, *args, **kwargs):
        """Store the optional ``pprint_exs`` keyword and forward everything else.

        ``pprint_exs`` defaults to an empty list and is removed from
        ``kwargs`` before the parent constructor is called.
        """
        inline_names = kwargs.pop("pprint_exs", [])
        self.pprint_exs = inline_names
        # NOTE(review): the super() lookup starts *after*
        # BeautifulSoup.BeautifulSoup in the MRO, so that class's own
        # __init__ is not run -- confirm this is intentional.
        super(BeautifulSoup.BeautifulSoup, self).__init__(*args, **kwargs)

    def prettify(self, encoding=BeautifulSoup.DEFAULT_OUTPUT_ENCODING):
        """Return the pretty-printed document, passing along the stored ``pprint_exs``."""
        return self.__str__(encoding, prettyPrint=True,
                            pprint_exs=self.pprint_exs)
# Sample markup: nested <div>s holding runs of <span> and <a> elements.
doc = '''
<div>
<div>
<span>a</span><span>b</span>
<a>link1</a>
<a>link2</a>
<span>c</span>
</div>
<a>link3</a><a>link4</a>
</div>
'''

# Ask the pretty-printer to keep <a> and <span> elements inline.
soup = PumpkinSoup(doc, pprint_exs=["a", "span"])
print(soup.prettify())
Tam da aradığım şeydi! Ayrıca, "11" gün ... sadece bir aylık :D –
Bu bilgi için teşekkür ederiz! : D Yayınınla ilgili yorumumu kaldırmak için gönderiyi düzenleyeceğim. –
Orijinal işaretleme javascript (aslında süslü parantez) içeriyorsa bu işe yaramaz. Şaşırtıcı olmayan bir şekilde, `format` kullanılırken `KeyError` hatalarına yol açar. –