@inproceedings{10.1145/3563657.3596138,
  author    = {Zamfirescu-Pereira, J.D. and Wei, Heather and Xiao, Amy and Gu, Kitty and Jung, Grace and Lee, Matthew G and Hartmann, Bjoern and Yang, Qian},
  title     = {Herding AI Cats: Lessons from Designing a Chatbot by Prompting GPT-3},
  year      = {2023},
  isbn      = {9781450398930},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3563657.3596138},
  doi       = {10.1145/3563657.3596138},
  abstract  = {Prompting Large Language Models (LLMs) is an exciting new approach to designing chatbots. But can it improve LLM's user experience (UX) reliably enough to power chatbot products? Our attempt to design a robust chatbot by prompting GPT-3/4 alone suggests: not yet. Prompts made achieving ``80\%'' UX goals easy, but not the remaining 20\%. Fixing the few remaining interaction breakdowns resembled herding cats: We could not address one UX issue or test one design solution at a time; instead, we had to handle everything everywhere all at once. Moreover, because no prompt could make GPT reliably say ``I don't know'' when it should, the user-GPT conversations had no guardrails after a breakdown occurred, often leading to UX downward spirals. These risks incentivized us to design highly prescriptive prompts and scripted bots, counter to the promises of LLM-powered chatbots. This paper describes this case study, unpacks prompting's fickleness and its impact on UX design processes, and discusses implications for LLM-based design methods and tools.},
  booktitle = {Proceedings of the 2023 ACM Designing Interactive Systems Conference},
  pages     = {2206--2220},
  numpages  = {15},
  keywords  = {UX, conversational user interface, prompt engineering, GPT},
  location  = {Pittsburgh, PA, USA},
  series    = {DIS '23}
}