Try an interactive version of this dialog: Sign up at solve.it.com, click Upload, and pass this URL.
def html2ft(html, attr1st=False):
    """Turn an HTML string into the source text of the matching `ft` expression.

    The markup is parsed with BeautifulSoup's 'html.parser'; strings matched by
    `risinstance(Comment)` are removed from the tree first. Text nodes are
    trimmed with `str.strip()` and whitespace-only text nodes are skipped.
    With `attr1st` the attributes are written in a first call and the children
    in a second one, i.e. `Tag(attrs)(children)` rather than `Tag(children, attrs)`.
    """
    aliases = {'class': 'cls', 'for': 'fr'}

    def _attr_args(tag):
        "Keyword-argument source strings for the attributes of `tag` (`class` sorted last)"
        kwargs, unusual = [], {}
        ordered = sorted(tag.attrs.items(), key=lambda kv: kv[0]=='class')
        for name, val in ordered:
            if val is None or val == True: val = True  # handle boolean attributes
            elif isinstance(val, (tuple, list)): val = " ".join(val)
            name = aliases.get(name, name)
            val = val or True  # an empty value is written as a bare flag
            if _re_h2x_attr_key.match(name):
                kwargs.append(f'{name.replace("-", "_")}={val!r}')
            else:
                # keys not matched by `_re_h2x_attr_key` are passed through a `**{...}` dict
                unusual[name] = val
        if unusual: kwargs.append(f'**{unusual!r}')
        return kwargs

    def _parse(elm, lvl=0, indent=4):
        if isinstance(elm, str):
            text = elm.strip()
            return repr(text) if text else ''
        if isinstance(elm, list):
            return '\n'.join(_parse(item, lvl) for item in elm)
        tag_name = elm.name.capitalize().replace("-", "_")
        if tag_name == '[document]':
            return _parse(list(elm.children), lvl)

        kids = elm.contents
        parts = []
        for kid in kids:
            if not str(kid).strip(): continue  # skip whitespace-only nodes
            parts.append(repr(kid.strip()) if isinstance(kid, str) else _parse(kid, lvl+1))
        attrs = _attr_args(elm)

        pad = " "*lvl*indent
        closing = " "*(lvl-1)*indent
        # a tag with no content, or with a single text node, is written on one line
        leaf = not kids or (len(kids)==1 and isinstance(kids[0], str))
        sep = ', ' if leaf else f',\n{pad}'

        if not attr1st:
            body = sep.join(filter(None, parts+attrs))
            if leaf: return f'{tag_name}({body})'
            return f'{tag_name}(\n{pad}{body}\n{closing})'

        # attr1st: attributes go in the first call, children in the second
        kw = ', '.join(filter(None, attrs))
        if leaf:
            return f'{tag_name}({kw})({parts[0] if parts else ""})'
        kids_src = sep.join(filter(None, parts))
        if not attrs:
            return f'{tag_name}(\n{pad}{kids_src}\n{closing})'
        return f'{tag_name}({kw})(\n{pad}{kids_src}\n{closing})'

    soup = BeautifulSoup(html.strip(), 'html.parser')
    for comment in soup.find_all(string=risinstance(Comment)): comment.extract()
    return _parse(soup, 1)
File: /usr/local/lib/python3.12/site-packages/fasthtml/components.py
Yes! You can use isinstance() to check if a child is a string. Try this:
isinstance(list(d.children)[0], str)
This will return True for the first child ("Howdy ") and False for the second child (the <a> tag).
Would you like to see how to loop through all children and identify which ones are strings?
Looking at the code, the issue is in the _parse function where it handles string elements:
if isinstance(elm, str): return repr(elm.strip()) if elm.strip() else ''
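The .strip() removes both leading and trailing whitespace from each text node. So ' how are you? ' becomes 'how are you?'. Note that this branch only handles text nodes that are parsed directly (such as top-level text); text that sits inside an element goes through the same .strip() in the list comprehension that builds cs (repr(c.strip()) ...), so both places need to change.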
A couple of approaches to consider:
- Only strip trailing whitespace but preserve leading spaces
- Don't strip at all and let the output preserve original spacing
- Normalize whitespace more intelligently (collapse multiple spaces but preserve single spaces between elements)
Which approach would work best for your use case? Are you trying to preserve the exact spacing from the HTML, or do you want something more normalized?
Good question! Yes, there are cases where stripping makes sense:
- Whitespace from HTML formatting - When HTML is indented for readability, you often get text nodes that are just newlines and spaces between tags that you don't want in the output
- Empty text nodes - Purely whitespace nodes between elements that have no semantic meaning
But you probably want to preserve meaningful spaces - like the space between words when text and tags are inline.
The tricky part is distinguishing between formatting whitespace and meaningful whitespace. HTML itself does this by collapsing multiple whitespace characters into one space.
Do you want to handle this more like HTML does (collapse whitespace but preserve spaces between inline content), or do you have a different goal in mind?
def html2ft_nostrip(html, attr1st=False):
    """Convert HTML to an `ft` expression, keeping meaningful whitespace in text.

    Text nodes are not trimmed. Following HTML's whitespace rules, every run of
    ASCII whitespace (space, tab, newline, carriage return, form feed) inside a
    text node is collapsed to a single space, so `Howdy <a>you</a>` yields
    `'Howdy '` while indentation used to lay out the markup does not leak into
    the output. Text nested anywhere inside `<pre>` or `<textarea>` is emitted
    verbatim.

    Outside `<pre>`/`<textarea>`, text nodes made only of ASCII whitespace are
    dropped: layout whitespace between tags cannot be told apart from a
    significant space between two inline elements here. Non-breaking spaces are
    content, not whitespace, and are kept.

    Args:
        html: HTML source to convert; leading/trailing whitespace of the whole
            document is removed before parsing.
        attr1st: if true, write `Tag(attrs)(children)` instead of
            `Tag(children, attrs)`.

    Returns:
        Source text of the `ft` expression(s); top-level nodes are separated by
        newlines.
    """
    import re  # function-scope import so the block does not rely on file-level imports

    rev_map = {'class': 'cls', 'for': 'fr'}
    html_ws = ' \t\n\r\f'  # HTML "ASCII whitespace"
    ws_run = re.compile(r'[ \t\n\r\f]+')
    verbatim_tags = ('pre', 'textarea')

    def _text(s, verbatim):
        "Python literal for text node `s`, or '' when the node should be omitted"
        s = str(s)  # plain `str`, so `repr` never depends on the node's own class
        if verbatim: return repr(s) if s else ''
        if not s.strip(html_ws): return ''
        return repr(ws_run.sub(' ', s))

    def _parse(elm, lvl=0, indent=4, verbatim=False):
        if isinstance(elm, str): return _text(elm, verbatim)
        if isinstance(elm, list): return '\n'.join(_parse(o, lvl) for o in elm)
        tag_name = elm.name.capitalize().replace("-", "_")
        if tag_name=='[document]': return _parse(list(elm.children), lvl)
        # whitespace stays significant for everything nested in <pre>/<textarea>
        verbatim = verbatim or elm.name in verbatim_tags
        cts = elm.contents
        # omitted text nodes come back as '' and are filtered out here
        cs = [o for o in (_parse(c, lvl+1, verbatim=verbatim) for c in cts) if o]
        attrs, exotic_attrs = [], {}
        # `class` sorts last; all other attributes keep their original order
        for key, value in sorted(elm.attrs.items(), key=lambda x: x[0]=='class'):
            if value is None or value == True: value = True  # handle boolean attributes
            elif isinstance(value,(tuple,list)): value = " ".join(value)
            key, value = rev_map.get(key, key), value or True
            if _re_h2x_attr_key.match(key): attrs.append(f'{key.replace("-", "_")}={value!r}')
            else: exotic_attrs[key] = value  # passed through a `**{...}` dict below
        if exotic_attrs: attrs.append(f'**{exotic_attrs!r}')
        spc = " "*lvl*indent
        # no content, or a single text node: write the tag on one line
        onlychild = not cts or (len(cts)==1 and isinstance(cts[0],str))
        j = ', ' if onlychild else f',\n{spc}'
        inner = j.join(filter(None, cs+attrs))
        if onlychild:
            if not attr1st: return f'{tag_name}({inner})'
            # respect attr1st setting
            inner_attrs = ', '.join(filter(None, attrs))
            return f'{tag_name}({inner_attrs})({cs[0] if cs else ""})'
        if not attr1st or not attrs: return f'{tag_name}(\n{spc}{inner}\n{" "*(lvl-1)*indent})'
        inner_cs = j.join(filter(None, cs))
        inner_attrs = ', '.join(filter(None, attrs))
        return f'{tag_name}({inner_attrs})(\n{spc}{inner_cs}\n{" "*(lvl-1)*indent})'

    soup = BeautifulSoup(html.strip(), 'html.parser')
    # drop strings matched by `risinstance(Comment)` before converting
    for c in soup.find_all(string=risinstance(Comment)): c.extract()
    return _parse(soup, 1)
In HTML is it correct to strip out most of the whitespace? Are there formal rules for where it should and shouldn't be removed?
Yes, HTML has formal rules for whitespace handling! The key concept is that HTML collapses whitespace in most contexts:
- Multiple whitespace characters (spaces, tabs, newlines) are collapsed into a single space
- Leading and trailing whitespace within block-level elements is typically ignored
- Whitespace between inline elements is preserved as a single space
There are important exceptions:
- `<pre>` tags preserve all whitespace exactly
- `<textarea>` preserves whitespace
- The CSS `white-space` property can change behavior
- Certain elements have special rules
So the original html2ft function's approach of stripping is closer to how browsers render HTML, but it's a bit aggressive. A more accurate approach would collapse consecutive whitespace into single spaces rather than stripping entirely.
Would you like to explore how to implement proper whitespace collapsing that matches HTML's behavior?